diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 234988280..85e0336b0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -50,10 +50,11 @@ jobs:
         -   uses: actions/checkout@v4
         -   name: "Main Script"
             run: |
+                EXTRA_INSTALL="mypy pytest types-colorama types-Pygments"
                 curl -L -O https://tiker.net/ci-support-v0
                 . ./ci-support-v0
+
                 build_py_project_in_conda_env
-                python -m pip install mypy
                 ./run-mypy.sh
 
     pytest:
@@ -194,9 +195,9 @@ jobs:
                 cd /home/firedrake/firedrake/src/firedrake
 
                 # patch so exception messages get shown
-                curl -L https://gist.githubusercontent.com/inducer/17d7134ace215f0df1f3627eac4195c7/raw/63edfaf2ec8bf06987896569a4f24264df490e9e/firedrake-debug-patch.diff | patch -p1
+                curl -L https://gist.githubusercontent.com/inducer/17d7134ace215f0df1f3627eac4195c7/raw/ec5470a7d8587b6e1f336f3ef1d0ece5e26f236a/firedrake-debug-patch.diff | patch -p1
 
-                pytest --tb=native -rsxw --durations=10 -m 'not parallel' tests/multigrid/
+                pytest --tb=native -rsxw --durations=10 tests/firedrake/regression -k "poisson_strong or stokes_mini or dg_advection"
 
     validate_cff:
             name: Validate CITATION.cff
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 2c314752e..9cd13dd54 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -158,11 +158,10 @@ Ruff:
 
 Mypy:
   script: |
-    EXTRA_INSTALL="pybind11 numpy"
+    EXTRA_INSTALL="mypy pybind11 numpy types-colorama types-Pygments"
     curl -L -O https://tiker.net/ci-support-v0
     . ./ci-support-v0
     build_py_project_in_venv
-    python -m pip install mypy
     ./run-mypy.sh
   tags:
   - python3
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index a87cfef7d..000000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,23 +0,0 @@
-include test/*.py
-include test/*.f90
-recursive-include examples *.py *.cl *.floopy *.sh *.ipynb *.cpp *.loopy
-recursive-include contrib *.vim *.py
-
-include build-helpers/*.sh
-include build-helpers/*.spec
-
-include doc/*.rst
-include doc/Makefile
-include doc/*.py
-include doc/images/*.png
-include doc/_static/*.css
-include doc/_templates/*.html
-include doc/images/*.svg
-include doc/images/*.png
-
-include configure.py
-include Makefile.in
-include README.rst
-include LICENSE
-include CITATION.cff
-include requirements*.txt
diff --git a/contrib/c-integer-semantics.py b/contrib/c-integer-semantics.py
index 8556430d0..8b30415c2 100644
--- a/contrib/c-integer-semantics.py
+++ b/contrib/c-integer-semantics.py
@@ -95,8 +95,7 @@ def main():
         func.argtypes = [ctypes.c_longlong, ctypes.c_longlong]
         func.restype = ctypes.c_longlong
 
-    cdiv = int_exp.cdiv  # noqa
-    cmod = int_exp.cmod  # noqa
+    cmod = int_exp.cmod
     int_floor_div = int_exp.loopy_floor_div_int64
     int_floor_div_pos_b = int_exp.loopy_floor_div_pos_b_int64
     int_mod_pos_b = int_exp.loopy_mod_pos_b_int64
diff --git a/contrib/mem-pattern-explorer/pattern_vis.py b/contrib/mem-pattern-explorer/pattern_vis.py
index bbde23174..f285dbb88 100644
--- a/contrib/mem-pattern-explorer/pattern_vis.py
+++ b/contrib/mem-pattern-explorer/pattern_vis.py
@@ -27,7 +27,7 @@ def __init__(self, gsize, lsize, subgroup_size=32, decay_constant=0.75):
 
         self.arrays = []
 
-    def l(self, index):  # noqa: E741,E743
+    def l(self, index):  # noqa: E743
         subscript = [np.newaxis] * self.ind_length
         subscript[len(self.gsize) + index] = slice(None)
 
@@ -147,7 +147,7 @@ def get_plot_data(self):
                 div_ceil(nelements, self.elements_per_row),
                 self.elements_per_row,)
         shaped_array = np.zeros(
-                base_shape + (self.nattributes,),
+                (*base_shape, self.nattributes),
                 dtype=np.float32)
         shaped_array.reshape(-1, self.nattributes)[:nelements] = self.array
 
@@ -160,7 +160,7 @@ def get_plot_data(self):
         else:
             subgroup.fill(1)
 
-        rgb_array = np.zeros(base_shape + (3,))
+        rgb_array = np.zeros((*base_shape, 3))
         if 1:
             if len(self.ctx.gsize) > 1:
                 # g.0 -> red
diff --git a/doc/conf.py b/doc/conf.py
index c4a13c445..b23ce311b 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -2,7 +2,7 @@
 from urllib.request import urlopen
 
 
-_conf_url = "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py"  # noqa
+_conf_url = "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py"
 with urlopen(_conf_url) as _inf:
     exec(compile(_inf.read(), _conf_url, "exec"), globals())
 
@@ -35,20 +35,6 @@
         "pyrsistent": ("https://pyrsistent.readthedocs.io/en/latest/", None),
         }
 
-# Some modules need to import things just so that sphinx can resolve symbols in
-# type annotations. Often, we do not want these imports (e.g. of PyOpenCL) when
-# in normal use (because they would introduce unintended side effects or hard
-# dependencies). This flag exists so that these imports only occur during doc
-# build. Since sphinx appears to resolve type hints lexically (as it should),
-# this needs to be cross-module (since, e.g. an inherited arraycontext
-# docstring can be read by sphinx when building meshmode, a dependent package),
-# this needs a setting of the same name across all packages involved, that's
-# why this name is as global-sounding as it is.
-import sys
-
-
-sys._BUILDING_SPHINX_DOCS = True
-
 nitpicky = True
 
 nitpick_ignore_regex = [
@@ -62,13 +48,13 @@
         ["py:class", r"immutables\.(.+)"],
 
         # Reference not found from "<unknown>"? I'm not even sure where to look.
-        ["py:class", r"Expression"],
+        ["py:class", r"ExpressionNode"],
+
+        # Type aliases
+        ["py:class", r"InameStr"],
+        ["py:class", r"ConcreteCallablesTable"],
+        ["py:class", r"LoopNestTree"],
+        ["py:class", r"LoopTree"],
+        ["py:class", r"ToLoopyTypeConvertible"],
+        ["py:class", r"ToStackMatchConvertible"],
         ]
-
-autodoc_type_aliases = {
-    "ToLoopyTypeConvertible": "ToLoopyTypeConvertible",
-    "ExpressionT": "ExpressionT",
-    "InameStr": "InameStr",
-    "ShapeType": "ShapeType",
-    "StridesType": "StridesType",
-}
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 2962c23b8..227356b11 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -25,7 +25,7 @@ consist of arithmetic operations and calls to functions.
 If the outermost operation of the RHS expression is a function call,
 the RHS value may be a tuple, and multiple (still scalar) arrays appear
 as LHS values. (This is the only sense in which tuple types are supported.)
-Each statement is parametrized by zero or more loop variables ("inames").
+Each statement is parameterized by zero or more loop variables ("inames").
 A statement is executed once for each integer point defined by the domain
 forest for the iname tuple given for that statement
 (:attr:`loopy.InstructionBase.within_inames`). Each execution of a
@@ -656,8 +656,6 @@ Helper values
 
 .. {{{
 
-.. autoclass:: auto
-
 .. autoclass:: UniqueName
 
 .. autoclass:: Optional
@@ -693,11 +691,7 @@ The Kernel Object
 Do not create :class:`LoopKernel` objects directly. Instead, refer to
 :ref:`creating-kernels`.
 
-.. autoclass:: LoopKernel
-
-.. autoclass:: KernelState
-    :members:
-    :undoc-members:
+.. automodule:: loopy.kernel
 
 Implementation Details: The Base Array
 --------------------------------------
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 1dd43e7e2..8b531d23b 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1681,7 +1681,7 @@ Each line of output will look roughly like::
   data type accessed.
 
 - lid_strides: A :class:`dict` of **{** :class:`int` **:**
-  :class:`pymbolic.primitives.Expression` or :class:`int` **}** that specifies
+  :data:`~pymbolic.typing.Expression` or :class:`int` **}** that specifies
   local strides for each local id in the memory access index. Local ids not
   found will not be present in ``lid_strides.keys()``. Uniform access (i.e.
   work-items within a sub-group access the same item) is indicated by setting
@@ -1689,7 +1689,7 @@ Each line of output will look roughly like::
   which case the 0 key will not be present in lid_strides.
 
 - gid_strides: A :class:`dict` of **{** :class:`int` **:**
-  :class:`pymbolic.primitives.Expression` or :class:`int` **}** that specifies
+  :data:`~pymbolic.typing.Expression` or :class:`int` **}** that specifies
   global strides for each global id in the memory access index. Global ids not
   found will not be present in ``gid_strides.keys()``.
 
diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py
index bf6e29e47..ce61b16be 100644
--- a/examples/python/ispc-stream-harness.py
+++ b/examples/python/ispc-stream-harness.py
@@ -26,7 +26,7 @@ def transform(knl, vars, stream_dtype):
 
     knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(vars, stream_dtype))
 
-    knl = lp.set_argument_order(knl, vars + ["n"])
+    knl = lp.set_argument_order(knl, [*vars, "n"])
 
     return knl
 
diff --git a/loopy/__init__.py b/loopy/__init__.py
index ef9868e6f..734528219 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -478,7 +481,7 @@ def register_preamble_generators(kernel: LoopKernel, preamble_generators):
                         "and would thus disrupt loopy's caches"
                         % pgen)
 
-            new_pgens = (pgen,) + new_pgens
+            new_pgens = (pgen, *new_pgens)
 
     return kernel.copy(preamble_generators=new_pgens)
 
@@ -496,7 +499,7 @@ def register_symbol_manglers(kernel, manglers):
                         "and would disrupt loopy's caches"
                         % m)
 
-            new_manglers = (m,) + new_manglers
+            new_manglers = (m, *new_manglers)
 
     return kernel.copy(symbol_manglers=new_manglers)
 
diff --git a/loopy/__main__.py b/loopy/__main__.py
index 630b93830..d8b61adc1 100644
--- a/loopy/__main__.py
+++ b/loopy/__main__.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import loopy.cli
 
 
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 6ee762556..04a2b9239 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -21,7 +24,7 @@
 """
 
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional, Tuple
+from typing import TYPE_CHECKING
 from warnings import warn
 
 import numpy as np
@@ -77,26 +80,26 @@ def fill_rand(ary):
 @dataclass
 class TestArgInfo:
     name: str
-    ref_array: "cla.Array"
-    ref_storage_array: "cla.Array"
+    ref_array: cla.Array
+    ref_storage_array: cla.Array
 
-    ref_pre_run_array: "cla.Array"
-    ref_pre_run_storage_array: "cla.Array"
+    ref_pre_run_array: cla.Array
+    ref_pre_run_storage_array: cla.Array
 
-    ref_shape: Tuple[int, ...]
-    ref_strides: Tuple[int, ...]
+    ref_shape: tuple[int, ...]
+    ref_strides: tuple[int, ...]
     ref_alloc_size: int
-    ref_numpy_strides: Tuple[int, ...]
+    ref_numpy_strides: tuple[int, ...]
     needs_checking: bool
 
     # The attributes below are being modified in make_args, hence this dataclass
     # cannot be frozen.
-    test_storage_array: Optional["cla.Array"] = None
-    test_array: Optional["cla.Array"] = None
-    test_shape: Optional[Tuple[int, ...]] = None
-    test_strides: Optional[Tuple[int, ...]] = None
-    test_numpy_strides: Optional[Tuple[int, ...]] = None
-    test_alloc_size: Optional[Tuple[int, ...]] = None
+    test_storage_array: cla.Array | None = None
+    test_array: cla.Array | None = None
+    test_shape: tuple[int, ...] | None = None
+    test_strides: tuple[int, ...] | None = None
+    test_numpy_strides: tuple[int, ...] | None = None
+    test_alloc_size: int | None = None
 
 
 # {{{ "reference" arguments
@@ -410,12 +413,12 @@ def auto_test_vs_ref(
     if ref_entrypoint is None:
         if len(ref_prog.entrypoints) != 1:
             raise LoopyError("Unable to guess entrypoint for ref_prog.")
-        ref_entrypoint = list(ref_prog.entrypoints)[0]
+        ref_entrypoint = next(iter(ref_prog.entrypoints))
 
     if test_entrypoint is None:
         if len(test_prog.entrypoints) != 1:
-            raise LoopyError("Unable to guess entrypoint for ref_prog.")
+            raise LoopyError("Unable to guess entrypoint for test_prog.")
-        test_entrypoint = list(test_prog.entrypoints)[0]
+        test_entrypoint = next(iter(test_prog.entrypoints))
 
     ref_prog = lp.preprocess_kernel(ref_prog)
     test_prog = lp.preprocess_kernel(test_prog)
diff --git a/loopy/check.py b/loopy/check.py
index ee24d6e4b..1a63c90bc 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -22,9 +25,8 @@
 
 import logging
 from collections import defaultdict
-from collections.abc import Mapping, Sequence
 from functools import reduce
-from typing import List, Optional, Tuple, Union
+from typing import TYPE_CHECKING
 
 import numpy as np
 
@@ -39,7 +41,6 @@
     WriteRaceConditionWarning,
     warn_with_kernel,
 )
-from loopy.kernel import LoopKernel
 from loopy.kernel.array import (
     ArrayBase,
     FixedStrideArrayDimTag,
@@ -68,7 +69,15 @@
     check_each_kernel,
 )
 from loopy.type_inference import TypeReader
-from loopy.typing import ExpressionT, not_none
+from loopy.typing import not_none
+
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping, Sequence
+
+    from pymbolic.typing import Expression
+
+    from loopy.kernel import LoopKernel
 
 
 logger = logging.getLogger(__name__)
@@ -206,22 +215,22 @@ def check_separated_array_consistency(kernel: LoopKernel) -> None:
                 for attr_name in ["address_space", "is_input", "is_output"]:
                     if getattr(arg, attr_name) != getattr(sub_arg, attr_name):
                         raise LoopyError(
-                                "Attribute '{attr_name}' of "
+                                f"Attribute '{attr_name}' of "
                                 f"'{arg.name}' and associated sep array "
                                 f"'{sub_arg.name}' is not consistent.")
 
 
 @check_each_kernel
 def check_offsets_and_dim_tags(kernel: LoopKernel) -> None:
-    from pymbolic.primitives import Expression, Variable
+    from pymbolic.primitives import ExpressionNode, Variable
 
     from loopy.symbolic import DependencyMapper
 
     arg_name_vars = {Variable(name) for name in kernel.arg_dict}
-    dep_mapper = DependencyMapper()
+    dep_mapper: DependencyMapper[[]] = DependencyMapper()
 
     def ensure_depends_only_on_arguments(
-            what: str, expr: Union[str, ExpressionT]) -> None:
+            what: str, expr: str | Expression) -> None:
         if isinstance(expr, str):
             expr = Variable(expr)
 
@@ -241,14 +250,14 @@ def ensure_depends_only_on_arguments(
                 continue
             if arg.offset is auto:
                 pass
-            elif isinstance(arg.offset, (int, np.integer, Expression, str)):
+            elif isinstance(arg.offset, (int, np.integer, ExpressionNode, str)):
                 ensure_depends_only_on_arguments(what, arg.offset)
 
             else:
                 raise LoopyError(f"invalid value of offset for '{arg.name}'")
 
             if arg.dim_tags is None:
-                new_dim_tags: Optional[Tuple[ArrayDimImplementationTag, ...]] = \
+                new_dim_tags: tuple[ArrayDimImplementationTag, ...] | None = \
                         arg.dim_tags
             else:
                 new_dim_tags = ()
@@ -259,13 +268,13 @@ def ensure_depends_only_on_arguments(
                         if dim_tag.stride is auto:
                             pass
                         elif isinstance(
-                                dim_tag.stride, (int, np.integer, Expression)):
+                                dim_tag.stride, (int, np.integer, ExpressionNode)):
                             ensure_depends_only_on_arguments(what, dim_tag.stride)
                         else:
                             raise LoopyError(f"invalid value of {what}")
 
                     assert new_dim_tags is not None
-                    new_dim_tags = new_dim_tags + (dim_tag,)
+                    new_dim_tags = (*new_dim_tags, dim_tag)
 
             arg = arg.copy(dim_tags=new_dim_tags)
 
@@ -281,7 +290,7 @@ def ensure_depends_only_on_arguments(
             pass
         if tv.offset is auto:
             pass
-        elif isinstance(tv.offset, (int, np.integer, Expression, str)):
+        elif isinstance(tv.offset, (int, np.integer, ExpressionNode, str)):
             ensure_depends_only_on_arguments(what, tv.offset)
         else:
             raise LoopyError(f"invalid value of offset for '{tv.name}'")
@@ -294,7 +303,7 @@ def ensure_depends_only_on_arguments(
                     if dim_tag.stride is auto:
                         raise LoopyError(f"The {what}" f" is 'auto', "
                                 "which is not allowed.")
-                    elif isinstance(dim_tag.stride, (int, np.integer, Expression)):
+                    elif isinstance(dim_tag.stride, (int, np.integer, ExpressionNode)):
                         ensure_depends_only_on_arguments(what, dim_tag.stride)
                     else:
                         raise LoopyError(f"invalid value of {what}")
@@ -1323,7 +1332,7 @@ def check_for_nested_base_storage(kernel: LoopKernel) -> None:
     # must run after preprocessing has created variables for base_storage
 
     from loopy.kernel.data import ArrayArg
-    arrays: List[ArrayBase] = [
+    arrays: list[ArrayBase] = [
         arg for arg in kernel.args if isinstance(arg, ArrayArg)
         ]
     arrays = arrays + list(kernel.temporary_variables.values())
diff --git a/loopy/cli.py b/loopy/cli.py
index 69c35fcea..4841f8e9f 100644
--- a/loopy/cli.py
+++ b/loopy/cli.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import sys
 
 import numpy as np
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 2e39d89bd..3c3b42f34 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -21,59 +24,46 @@
 """
 
 import logging
-import sys
 from dataclasses import dataclass, replace
 from typing import (
     TYPE_CHECKING,
     Any,
-    FrozenSet,
     Mapping,
-    Optional,
     Sequence,
-    Set,
-    Tuple,
-    Union,
 )
 
-from immutables import Map
-
-from loopy.codegen.result import CodeGenerationResult
-from loopy.library.reduction import ReductionOpFunction
-from loopy.translation_unit import CallablesTable, TranslationUnit
+import immutables
 
 
 logger = logging.getLogger(__name__)
 
 from functools import reduce
 
+import islpy  # to help out Sphinx
 import islpy as isl
-from pytools import ProcessLogger, UniqueNameGenerator
+import pytools  # to help out Sphinx
+from pytools import ProcessLogger
 from pytools.persistent_dict import WriteOncePersistentDict
 
 from loopy.diagnostic import LoopyError, warn
-from loopy.kernel import LoopKernel
 from loopy.kernel.function_interface import CallableKernel
 from loopy.symbolic import CombineMapper
-from loopy.target import TargetBase
 from loopy.tools import LoopyKeyBuilder, caches
-from loopy.types import LoopyType
-from loopy.typing import ExpressionT
 from loopy.version import DATA_MODEL_VERSION
 
 
 if TYPE_CHECKING:
-    from loopy.codegen.result import GeneratedProgram
+    from loopy.codegen.result import CodeGenerationResult, GeneratedProgram
     from loopy.codegen.tools import CodegenOperationCacheManager
-
-
-if getattr(sys, "_BUILDING_SPHINX_DOCS", False):
-    from loopy.codegen.result import GeneratedProgram  # noqa: F811
-    from loopy.codegen.tools import CodegenOperationCacheManager  # noqa: F811
+    from loopy.kernel import LoopKernel
+    from loopy.library.reduction import ReductionOpFunction
+    from loopy.target import TargetBase
+    from loopy.translation_unit import CallablesTable, TranslationUnit
+    from loopy.types import LoopyType
+    from loopy.typing import Expression
 
 
 __doc__ = """
-.. currentmodule:: loopy.codegen
-
 .. autoclass:: PreambleInfo
 
 .. autoclass:: VectorizationInfo
@@ -90,9 +80,9 @@
 
 References
 ^^^^^^^^^^
-.. class:: Expression
+.. class:: ExpressionNode
 
-    See :class:`pymbolic.Expression`.
+    See :class:`pymbolic.primitives.ExpressionNode`.
 """
 
 
@@ -112,8 +102,6 @@ class VectorizationInfo:
 
     iname: str
     length: int
-    # FIXME why is this here?
-    space: isl.Space
 
 
 @dataclass(frozen=True)
@@ -134,108 +122,76 @@ class SeenFunction:
     """
     name: str
     c_name: str
-    arg_dtypes: Tuple[LoopyType, ...]
-    result_dtypes: Tuple[LoopyType, ...]
+    arg_dtypes: tuple[LoopyType, ...]
+    result_dtypes: tuple[LoopyType, ...]
 
 
 @dataclass(frozen=True)
 class CodeGenerationState:
     """
-    .. attribute:: kernel
-    .. attribute:: target
-    .. attribute:: implemented_domain
-
-        The entire implemented domain (as an :class:`islpy.Set`)
-        i.e. all constraints that have been enforced so far.
-
-    .. attribute:: implemented_predicates
-
-        A :class:`frozenset` of predicates for which checks have been
-        implemented.
-
-    .. attribute:: seen_dtypes
-
-        set of dtypes that were encountered
-
-    .. attribute:: seen_functions
-
-        set of :class:`SeenFunction` instances
+    .. autoattribute:: kernel
+    .. autoattribute:: target
+    .. autoattribute:: implemented_domain
+    .. autoattribute:: implemented_predicates
 
+    .. autoattribute:: seen_dtypes
+    .. autoattribute:: seen_functions
-    .. attribute:: seen_atomic_dtypes
+    .. autoattribute:: seen_atomic_dtypes
 
-    .. attribute:: var_subst_map
-
-    .. attribute:: allow_complex
-
-    .. attribute:: vectorization_info
-
-        *None* (to mean vectorization has not yet been applied),  or an instance of
-        :class:`VectorizationInfo`.
-
-    .. attribute:: is_generating_device_code
+    .. autoattribute:: var_subst_map
 
-    .. attribute:: gen_program_name
-
-        None (indicating that host code is being generated)
-        or the name of the device program currently being
-        generated.
-
-    .. attribute:: schedule_index_end
-
-    .. attribute:: callables_table
-
-        A mapping from callable names to instances of
-        :class:`loopy.kernel.function_interface.InKernelCallable`.
-
-    .. attribute:: is_entrypoint
+    .. autoattribute:: allow_complex
+    .. autoattribute:: vectorization_info
+    .. autoattribute:: is_generating_device_code
+    .. autoattribute:: gen_program_name
+    .. autoattribute:: schedule_index_end
+    .. autoattribute:: callables_table
+    .. autoattribute:: is_entrypoint
+    .. autoattribute:: codegen_cache_manager
+    """
 
-        A :class:`bool` to indicate if the code is being generated for an
-        entrypoint kernel
+    kernel: LoopKernel
 
-    .. attribute:: codegen_cache_manager
+    # LoopKernel should not have a target, should use this instead
+    target: TargetBase
 
-        An instance of :class:`loopy.codegen.tools.CodegenOperationCacheManager`.
+    implemented_domain: islpy.Set
+    """
+    The entire implemented domain (as an :class:`islpy.Set`)
+    i.e. all constraints that have been enforced so far.
     """
 
-    kernel: LoopKernel
-    target: TargetBase
-    implemented_domain: isl.Set
-    implemented_predicates: FrozenSet[Union[str, ExpressionT]]
+    implemented_predicates: frozenset[str | Expression]
 
     # /!\ mutable
-    seen_dtypes: Set[LoopyType]
-    seen_functions: Set[SeenFunction]
-    seen_atomic_dtypes: Set[LoopyType]
+    seen_dtypes: set[LoopyType]
+    seen_functions: set[SeenFunction]
+    seen_atomic_dtypes: set[LoopyType]
 
-    var_subst_map: Map[str, ExpressionT]
+    var_subst_map: immutables.Map[str, Expression]
     allow_complex: bool
     callables_table: CallablesTable
     is_entrypoint: bool
-    var_name_generator: UniqueNameGenerator
+    var_name_generator: pytools.UniqueNameGenerator
     is_generating_device_code: bool
-    gen_program_name: str
-    schedule_index_end: int
-    codegen_cachemanager: "CodegenOperationCacheManager"
-    vectorization_info: Optional[VectorizationInfo] = None
 
-    def __post_init__(self):
-        # FIXME: If this doesn't bomb during testing, we can get rid of target.
-        assert self.target == self.kernel.target
+    gen_program_name: str
 
-        assert self.vectorization_info is None or isinstance(
-                self.vectorization_info, VectorizationInfo)
+    schedule_index_end: int
+    codegen_cache_manager: CodegenOperationCacheManager
+    vectorization_info: VectorizationInfo | None = None
 
     # {{{ copy helpers
 
-    def copy(self, **kwargs: Any) -> "CodeGenerationState":
+    def copy(self, **kwargs: Any) -> CodeGenerationState:
         return replace(self, **kwargs)
 
     def copy_and_assign(
-            self, name: str, value: ExpressionT) -> "CodeGenerationState":
+            self, name: str, value: Expression) -> CodeGenerationState:
         """Make a copy of self with variable *name* fixed to *value*."""
         return self.copy(var_subst_map=self.var_subst_map.set(name, value))
 
-    def copy_and_assign_many(self, assignments) -> "CodeGenerationState":
+    def copy_and_assign_many(self, assignments) -> CodeGenerationState:
         """Make a copy of self with *assignments* included."""
 
         return self.copy(var_subst_map=self.var_subst_map.update(assignments))
@@ -305,7 +261,8 @@ def unvectorize(self, func):
         novec_self = self.copy(vectorization_info=None)
 
         for i in range(vinf.length):
-            idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i
+            idx_aff = isl.Aff.zero_on_domain(
+                        isl.Space.params_alloc(self.kernel.isl_context, 0)) + i
             new_codegen_state = novec_self.fix(vinf.iname, idx_aff)
             generated = func(new_codegen_state)
 
@@ -367,10 +324,16 @@ def map_constant(self, expr):
 
 @dataclass(frozen=True)
 class PreambleInfo:
+    """
+    .. autoattribute:: kernel
+    .. autoattribute:: seen_dtypes
+    .. autoattribute:: seen_functions
+    .. autoattribute:: seen_atomic_dtypes
+    """
     kernel: LoopKernel
-    seen_dtypes: Set[LoopyType]
-    seen_functions: Set[SeenFunction]
-    seen_atomic_dtypes: Set[LoopyType]
+    seen_dtypes: set[LoopyType]
+    seen_functions: set[SeenFunction]
+    seen_atomic_dtypes: set[LoopyType]
 
     # FIXME: This makes all the above redundant. It probably shouldn't be here.
     codegen_state: CodeGenerationState
@@ -418,7 +381,7 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target,
             seen_dtypes=seen_dtypes,
             seen_functions=seen_functions,
             seen_atomic_dtypes=seen_atomic_dtypes,
-            var_subst_map=Map(),
+            var_subst_map=immutables.Map(),
             allow_complex=allow_complex,
             var_name_generator=kernel.get_var_name_generator(),
             is_generating_device_code=False,
@@ -429,7 +392,7 @@ def generate_code_for_a_single_kernel(kernel, callables_table, target,
             schedule_index_end=len(kernel.linearization),
             callables_table=callables_table,
             is_entrypoint=is_entrypoint,
-            codegen_cachemanager=CodegenOperationCacheManager.from_kernel(kernel),
+            codegen_cache_manager=CodegenOperationCacheManager.from_kernel(kernel),
             )
 
     from loopy.codegen.result import generate_host_or_device_program
@@ -519,7 +482,7 @@ def diverge_callee_entrypoints(program):
 
         new_callables[name] = clbl
 
-    return program.copy(callables_table=Map(new_callables))
+    return program.copy(callables_table=immutables.Map(new_callables))
 
 
 @dataclass(frozen=True)
@@ -543,10 +506,10 @@ class TranslationUnitCodeGenerationResult:
     .. automethod:: all_code
 
     """
-    host_programs: Mapping[str, "GeneratedProgram"]
-    device_programs: Sequence["GeneratedProgram"]
-    host_preambles: Sequence[Tuple[int, str]] = ()
-    device_preambles: Sequence[Tuple[int, str]] = ()
+    host_programs: Mapping[str, GeneratedProgram]
+    device_programs: Sequence[GeneratedProgram]
+    host_preambles: Sequence[tuple[int, str]] = ()
+    device_preambles: Sequence[tuple[int, str]] = ()
 
     def host_code(self):
         from loopy.codegen.result import process_preambles
@@ -666,7 +629,7 @@ def generate_code_v2(t_unit: TranslationUnit) -> CodeGenerationResult:
     # adding the callee fdecls to the device_programs
     device_programs = ([device_programs[0].copy(
             ast=t_unit.target.get_device_ast_builder().ast_module.Collection(
-                callee_fdecls+[device_programs[0].ast]))] +
+                [*callee_fdecls, device_programs[0].ast]))] +
             device_programs[1:])
 
     def not_reduction_op(name: str | ReductionOpFunction) -> str:
diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py
index a066d3425..0f3bdba41 100644
--- a/loopy/codegen/bounds.py
+++ b/loopy/codegen/bounds.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -21,13 +24,15 @@
 """
 
 
-from typing import FrozenSet
+from typing import TYPE_CHECKING
 
 import islpy as isl
 from islpy import dim_type
 
-from loopy.codegen.tools import CodegenOperationCacheManager
-from loopy.kernel import LoopKernel
+
+if TYPE_CHECKING:
+    from loopy.codegen.tools import CodegenOperationCacheManager
+    from loopy.kernel import LoopKernel
 
 
 # {{{ approximate, convex bounds check generator
@@ -62,7 +67,7 @@ def get_approximate_convex_bounds_checks(domain, check_inames,
 
 def get_usable_inames_for_conditional(
         kernel: LoopKernel, sched_index: int,
-        op_cache_manager: CodegenOperationCacheManager) -> FrozenSet[str]:
+        op_cache_manager: CodegenOperationCacheManager) -> frozenset[str]:
     active_inames = op_cache_manager.active_inames[sched_index]
     crosses_barrier = op_cache_manager.has_barrier_within[sched_index]
 
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index bee09229f..fd38c97e7 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -1,4 +1,5 @@
 """Loop nest build top-level control/hoisting."""
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -273,7 +274,7 @@ class ScheduleIndexInfo(ImmutableRecord):
                 schedule_indices=[i],
                 admissible_cond_inames=(
                     get_usable_inames_for_conditional(kernel, i,
-                        codegen_state.codegen_cachemanager)),
+                        codegen_state.codegen_cache_manager)),
                 required_predicates=get_required_predicates(kernel, i),
                 used_inames_within=find_used_inames_within(kernel, i)
                 )
@@ -470,7 +471,7 @@ def gen_code(inner_codegen_state):
 
                 prev_gen_code = gen_code
 
-                def gen_code(inner_codegen_state):  # noqa pylint:disable=function-redefined
+                def gen_code(inner_codegen_state):  # pylint: disable=function-redefined
                     condition_exprs = [
                             constraint_to_cond_expr(cns)
                             for cns in bounds_checks] + list(pred_checks)
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 1bc26733e..3b0195507 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -1,4 +1,5 @@
 """Code generation for Instruction objects."""
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -191,7 +192,7 @@ def generate_assignment_instruction_code(codegen_state, insn):
         from pymbolic.mapper.stringifier import PREC_NONE
         lhs_code = codegen_state.expression_to_code_mapper(insn.assignee, PREC_NONE)
 
-        from cgen import Statement as S  # noqa
+        from cgen import Statement as S
 
         gs, ls = kernel.get_grid_size_upper_bounds(codegen_state.callables_table)
 
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index c64c2ea67..44bfa07cc 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -210,7 +213,7 @@ def generate_vectorize_loop(codegen_state, sched_index):
             vectorization_info=VectorizationInfo(
                 iname=iname,
                 length=length,
-                space=length_aff.space))
+                ))
 
     return build_loop_nest(new_codegen_state, sched_index+1)
 
@@ -360,7 +363,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index, hints):
 
     # Note: this does not include loop_iname itself!
     usable_inames = get_usable_inames_for_conditional(kernel, sched_index,
-            codegen_state.codegen_cachemanager)
+            codegen_state.codegen_cache_manager)
 
     domain = kernel.get_inames_domain(loop_iname)
 
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 7fcb4294a..02b5ce27a 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
 
 __license__ = """
@@ -24,23 +27,18 @@
 from typing import (
     TYPE_CHECKING,
     Any,
-    Dict,
-    List,
     Mapping,
-    Optional,
     Sequence,
-    Tuple,
-    Union,
 )
 
-import islpy as isl
-
 
 if TYPE_CHECKING:
+    import islpy
+
     from loopy.codegen import CodeGenerationState
 
 
-def process_preambles(preambles: Sequence[Tuple[int, str]]) -> Sequence[str]:
+def process_preambles(preambles: Sequence[tuple[int, str]]) -> Sequence[str]:
     seen_preamble_tags = set()
     dedup_preambles = []
 
@@ -60,8 +58,6 @@ def process_preambles(preambles: Sequence[Tuple[int, str]]) -> Sequence[str]:
 __doc__ = """
 .. currentmodule:: loopy.codegen.result
 
-.. autoclass:: GeneratedProgram
-
 .. autoclass:: CodeGenerationResult
 
 .. autofunction:: merge_codegen_results
@@ -94,9 +90,9 @@ class GeneratedProgram:
     name: str
     is_device_program: bool
     ast: Any
-    body_ast: Optional[Any] = None
+    body_ast: Any | None = None
 
-    def copy(self, **kwargs: Any) -> "GeneratedProgram":
+    def copy(self, **kwargs: Any) -> GeneratedProgram:
         return replace(self, **kwargs)
 
 
@@ -121,13 +117,13 @@ class CodeGenerationResult:
     .. automethod:: device_code
     .. automethod:: all_code
     """
-    host_program: Optional[GeneratedProgram]
+    host_program: GeneratedProgram | None
     device_programs: Sequence[GeneratedProgram]
-    implemented_domains: Mapping[str, isl.Set]
-    host_preambles: Sequence[Tuple[str, str]] = ()
-    device_preambles: Sequence[Tuple[str, str]] = ()
+    implemented_domains: Mapping[str, islpy.Set]
+    host_preambles: Sequence[tuple[str, str]] = ()
+    device_preambles: Sequence[tuple[str, str]] = ()
 
-    def copy(self, **kwargs: Any) -> "CodeGenerationResult":
+    def copy(self, **kwargs: Any) -> CodeGenerationResult:
         return replace(self, **kwargs)
 
     @staticmethod
@@ -185,7 +181,7 @@ def all_code(self):
                 + str(self.host_program.ast))
 
     def current_program(
-            self, codegen_state: "CodeGenerationState") -> GeneratedProgram:
+            self, codegen_state: CodeGenerationState) -> GeneratedProgram:
         if codegen_state.is_generating_device_code:
             if self.device_programs:
                 result = self.device_programs[-1]
@@ -210,9 +206,7 @@ def with_new_program(self, codegen_state, program):
             assert program.is_device_program
             return self.copy(
                     device_programs=(
-                        list(self.device_programs[:-1])
-                        +
-                        [program]))
+                        [*list(self.device_programs[:-1]), program]))
         else:
             assert program.name == codegen_state.gen_program_name
             assert not program.is_device_program
@@ -233,8 +227,8 @@ def with_new_ast(self, codegen_state, new_ast):
 # {{{ support code for AST merging
 
 def merge_codegen_results(
-        codegen_state: "CodeGenerationState",
-        elements: Sequence[Union[CodeGenerationResult, Any]], collapse=True
+        codegen_state: CodeGenerationState,
+        elements: Sequence[CodeGenerationResult | Any], collapse=True
         ) -> CodeGenerationResult:
     elements = [el for el in elements if el is not None]
 
@@ -251,9 +245,9 @@ def merge_codegen_results(
 
     ast_els = []
     new_device_programs = []
-    new_device_preambles: List[Tuple[str, str]] = []
+    new_device_preambles: list[tuple[str, str]] = []
     dev_program_names = set()
-    implemented_domains: Dict[str, isl.Set] = {}
+    implemented_domains: dict[str, islpy.Set] = {}
     codegen_result = None
 
     block_cls = codegen_state.ast_builder.ast_block_class
diff --git a/loopy/codegen/tools.py b/loopy/codegen/tools.py
index cb6285b08..be3d6ade5 100644
--- a/loopy/codegen/tools.py
+++ b/loopy/codegen/tools.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2020 Kaushik Kulkarni"
 
 __license__ = """
@@ -22,13 +25,11 @@
 
 from dataclasses import dataclass
 from functools import cached_property
-from typing import Dict, FrozenSet, List
+from typing import TYPE_CHECKING
 
 from pytools import memoize_method
 
 from loopy.kernel import LoopKernel
-from loopy.kernel.data import Iname
-from loopy.kernel.instruction import InstructionBase
 from loopy.schedule import (
     Barrier,
     BeginBlockItem,
@@ -41,9 +42,12 @@
 )
 
 
-__doc__ = """
-.. currentmodule:: loopy.codegen.tools
+if TYPE_CHECKING:
+    import loopy.kernel.data
+    from loopy.kernel.instruction import InstructionBase
 
+
+__doc__ = """
 .. autoclass:: KernelProxyForCodegenOperationCacheManager
 
 .. autoclass:: CodegenOperationCacheManager
@@ -56,9 +60,9 @@ class KernelProxyForCodegenOperationCacheManager:
     Proxy to :class:`loopy.LoopKernel` to be used by
     :class:`CodegenOperationCacheManager`.
     """
-    instructions: List[InstructionBase]
-    linearization: List[ScheduleItem]
-    inames: Dict[str, Iname]
+    instructions: list[InstructionBase]
+    linearization: list[ScheduleItem]
+    inames: dict[str, loopy.kernel.data.Iname]
 
     @cached_property
     def id_to_insn(self):
@@ -208,7 +212,7 @@ def get_insn_ids_for_block_at(self, sched_index):
 
     @memoize_method
     def get_concurrent_inames_in_a_callkernel(
-            self, callkernel_index: int) -> FrozenSet[str]:
+            self, callkernel_index: int) -> frozenset[str]:
         """
         Returns a :class:`frozenset` of concurrent inames in a callkernel
 
diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index 39e2fa591..be281158b 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
diff --git a/loopy/expression.py b/loopy/expression.py
index 2581ec022..e3eb65dc5 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012-15 Andreas Kloeckner"
 
 __license__ = """
diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py
index 5e6ff24d0..7c184f73c 100644
--- a/loopy/frontend/fortran/__init__.py
+++ b/loopy/frontend/fortran/__init__.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
 
 __license__ = """
@@ -225,7 +228,7 @@ def parse_transformed_fortran(source, free_form=True, strict=True,
     prev_sys_path = sys.path
     try:
         if infile_dirname:
-            sys.path = prev_sys_path + [infile_dirname]
+            sys.path = [*prev_sys_path, infile_dirname]
 
         if pre_transform_code is not None:
             proc_dict["_MODULE_SOURCE_CODE"] = pre_transform_code
diff --git a/loopy/frontend/fortran/diagnostic.py b/loopy/frontend/fortran/diagnostic.py
index b2ea02c05..5d3df2a21 100644
--- a/loopy/frontend/fortran/diagnostic.py
+++ b/loopy/frontend/fortran/diagnostic.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
 __license__ = """
diff --git a/loopy/frontend/fortran/expression.py b/loopy/frontend/fortran/expression.py
index bb8394515..54bca20bb 100644
--- a/loopy/frontend/fortran/expression.py
+++ b/loopy/frontend/fortran/expression.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
 
 __license__ = """
@@ -22,6 +25,7 @@
 
 import re
 from sys import intern
+from typing import TYPE_CHECKING, ClassVar
 
 import numpy as np
 
@@ -31,6 +35,12 @@
 from loopy.frontend.fortran.diagnostic import TranslationError
 
 
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+    from loopy.symbolic import LexTable
+
+
 _less_than = intern("less_than")
 _greater_than = intern("greater_than")
 _less_equal = intern("less_equal")
@@ -65,7 +75,7 @@ def tuple_to_complex_literal(expr):
 # {{{ expression parser
 
 class FortranExpressionParser(ExpressionParserBase):
-    lex_table = [
+    lex_table: ClassVar[LexTable] = [
         (_less_than, pytools.lex.RE(r"\.lt\.", re.I)),
         (_greater_than, pytools.lex.RE(r"\.gt\.", re.I)),
         (_less_equal, pytools.lex.RE(r"\.le\.", re.I)),
@@ -142,7 +152,7 @@ def parse_terminal(self, pstate):
             return ExpressionParserBase.parse_terminal(
                     self, pstate)
 
-    COMP_MAP = {
+    COMP_MAP: ClassVar[Mapping[str, str]] = {
             _less_than: "<",
             _less_equal: "<=",
             _greater_than: ">",
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index fc9eace87..5000abf84 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
 
 __license__ = """
@@ -22,6 +25,7 @@
 
 import re
 from sys import intern
+from typing import ClassVar
 from warnings import warn
 
 import numpy as np
@@ -53,7 +57,7 @@ def __init__(self, scope):
         super().__init__()
 
     def get_cache_key(self, expr):
-        return super().get_cache_key(expr) + (self.scope,)
+        return (*super().get_cache_key(expr), self.scope)
 
     def map_subscript(self, expr):
         from pymbolic.primitives import Variable
@@ -441,7 +445,7 @@ def map_Implicit(self, node):
     def map_Equivalence(self, node):
         raise NotImplementedError("equivalence")
 
-    TYPE_MAP = {
+    TYPE_MAP: ClassVar[dict[tuple[str, str], type[np.generic]]] = {
             ("real", ""): np.float32,
             ("real", "4"): np.float32,
             ("real", "8"): np.float64,
@@ -455,9 +459,9 @@ def map_Equivalence(self, node):
             ("integer", "8"): np.int64,
             }
     if hasattr(np, "float128"):
-        TYPE_MAP[("real", "16")] = np.float128  # pylint:disable=no-member
+        TYPE_MAP["real", "16"] = np.float128  # pylint:disable=no-member
     if hasattr(np, "complex256"):
-        TYPE_MAP[("complex", "32")] = np.complex256  # pylint:disable=no-member
+        TYPE_MAP["complex", "32"] = np.complex256  # pylint:disable=no-member
 
     def dtype_from_stmt(self, stmt):
         length, kind = stmt.selector
@@ -471,7 +475,7 @@ def dtype_from_stmt(self, stmt):
         else:
             raise RuntimeError("both length and kind specified")
 
-        return np.dtype(self.TYPE_MAP[(type(stmt).__name__.lower(), length)])
+        return np.dtype(self.TYPE_MAP[type(stmt).__name__.lower(), length])
 
     def map_type_decl(self, node):
         scope = self.scope_stack[-1]
diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py
index b2af66f08..f1613b22f 100644
--- a/loopy/frontend/fortran/tree.py
+++ b/loopy/frontend/fortran/tree.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
 __license__ = """
@@ -62,7 +65,7 @@ def rec(self, expr, *args, **kwargs):
             r"^(?P<name>[_0-9a-zA-Z]+)\s*"
             r"(\((?P<shape>[-+*/0-9:a-zA-Z, \t]+)\))?"
             r"(\s*=\s*(?P<initializer>.+))?"
-            "$")
+            r"$")
 
     def parse_dimension_specs(self, node, dim_decls):
         def parse_bounds(bounds_str):
diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py
index ba734d8ba..92592bdba 100644
--- a/loopy/ipython_ext.py
+++ b/loopy/ipython_ext.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from IPython.core.magic import Magics, cell_magic, magics_class
 
 import loopy as lp
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 28aa3be30..04d0bcd98 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -1,4 +1,6 @@
 """isl helpers"""
+from __future__ import annotations
+
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -93,12 +95,12 @@ def make_slab(space, iname, start, stop, iname_multiplier=1):
 
     space = zero.get_domain_space()
 
-    from pymbolic.primitives import Expression
+    from pymbolic.primitives import ExpressionNode
 
     from loopy.symbolic import aff_from_expr
-    if isinstance(start, Expression):
+    if isinstance(start, ExpressionNode):
         start = aff_from_expr(space, start)
-    if isinstance(stop, Expression):
+    if isinstance(stop, ExpressionNode):
         stop = aff_from_expr(space, stop)
 
     if isinstance(start, int):
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 967640260..d612b5db3 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1,4 +1,14 @@
-"""Kernel object."""
+"""
+.. currentmodule:: loopy
+
+.. autoclass:: LoopKernel
+
+.. autoclass:: KernelState
+    :members:
+    :undoc-members:
+"""
+from __future__ import annotations
+
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -21,7 +31,6 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
-
 from collections import defaultdict
 from dataclasses import dataclass, field, fields, replace
 from enum import IntEnum
@@ -31,25 +40,19 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    Dict,
-    FrozenSet,
+    ClassVar,
     Iterator,
-    List,
     Mapping,
-    Optional,
     Sequence,
-    Set,
-    Tuple,
-    Union,
 )
 from warnings import warn
 
 import numpy as np
 from immutables import Map
 
+import islpy  # to help out Sphinx
 import islpy as isl
 from islpy import dim_type
-from pymbolic import ArithmeticExpressionT
 from pytools import (
     UniqueNameGenerator,
     generate_unique_names,
@@ -58,10 +61,11 @@
 )
 from pytools.tag import Tag, Taggable
 
+import loopy.codegen
+import loopy.kernel.data  # to help out Sphinx
 from loopy.diagnostic import CannotBranchDomainTree, LoopyError, StaticValueFindingError
 from loopy.kernel.data import (
     ArrayArg,
-    Iname,
     KernelArgument,
     SubstitutionRule,
     TemporaryVariable,
@@ -69,23 +73,24 @@
     _ArraySeparationInfo,
     filter_iname_tags_by_type,
 )
-from loopy.kernel.instruction import InstructionBase
-from loopy.options import Options
-from loopy.schedule import ScheduleItem
-from loopy.target import TargetBase
 from loopy.tools import update_persistent_hash
 from loopy.types import LoopyType, NumpyType
-from loopy.typing import ExpressionT, InameStr
 
 
 if TYPE_CHECKING:
-    from loopy.codegen import PreambleInfo
+    from pymbolic import ArithmeticExpression
+
     from loopy.kernel.function_interface import InKernelCallable
+    from loopy.kernel.instruction import InstructionBase
+    from loopy.options import Options
+    from loopy.schedule import ScheduleItem
+    from loopy.target import TargetBase
+    from loopy.typing import Expression, InameStr
 
 
 # {{{ loop kernel object
 
-class KernelState(IntEnum):  # noqa
+class KernelState(IntEnum):
     INITIAL = 0
     CALLS_RESOLVED = 1
     PREPROCESSED = 2
@@ -99,12 +104,9 @@ def _get_inames_from_domains(domains):
 
 @dataclass(frozen=True)
 class _BoundsRecord:
-    lower_bound_pw_aff: isl.PwAff
-    upper_bound_pw_aff: isl.PwAff
-    size: isl.PwAff
-
-
-PreambleGenerator = Callable[["PreambleInfo"], Iterator[Tuple[int, str]]]
+    lower_bound_pw_aff: islpy.PwAff
+    upper_bound_pw_aff: islpy.PwAff
+    size: islpy.PwAff
 
 
 @dataclass(frozen=True)
@@ -144,7 +146,7 @@ class LoopKernel(Taggable):
     .. automethod:: tagged
     .. automethod:: without_tags
     """
-    domains: Sequence[isl.BasicSet]
+    domains: Sequence[islpy.BasicSet]
     """Represents the :ref:`domain-tree`."""
 
     instructions: Sequence[InstructionBase]
@@ -153,13 +155,13 @@ class LoopKernel(Taggable):
     """
 
     args: Sequence[KernelArgument]
-    assumptions: isl.BasicSet
+    assumptions: islpy.BasicSet
     """
     Must be a :class:`islpy.BasicSet` parameter domain.
     """
 
     temporary_variables: Mapping[str, TemporaryVariable]
-    inames: Mapping[InameStr, Iname]
+    inames: Mapping[InameStr, loopy.kernel.data.Iname]
     """
     An entry is guaranteed to be present for each iname.
     """
@@ -167,16 +169,20 @@ class LoopKernel(Taggable):
     substitutions: Mapping[str, SubstitutionRule]
     options: Options
     target: TargetBase
-    tags: FrozenSet[Tag]
+    tags: frozenset[Tag]
     state: KernelState = KernelState.INITIAL
     name: str = "loopy_kernel"
 
-    preambles: Sequence[Tuple[int, str]] = ()
-    preamble_generators: Sequence[PreambleGenerator] = ()
+    preambles: Sequence[tuple[int, str]] = ()
+    preamble_generators: Sequence[
+        Callable[
+                [loopy.codegen.PreambleInfo],
+                Iterator[tuple[int, str]]]
+            ] = ()
     symbol_manglers: Sequence[
-            Callable[["LoopKernel", str], Optional[Tuple[LoopyType, str]]]] = ()
-    linearization: Optional[Sequence[ScheduleItem]] = None
-    iname_slab_increments: Mapping[InameStr, Tuple[int, int]] = field(
+            Callable[[LoopKernel, str], tuple[LoopyType, str] | None]] = ()
+    linearization: Sequence[ScheduleItem] | None = None
+    iname_slab_increments: Mapping[InameStr, tuple[int, int]] = field(
             default_factory=Map)
     """
     A mapping from inames to (lower_incr,
@@ -184,7 +190,7 @@ class LoopKernel(Taggable):
     'bulk' slabs with fewer conditionals.
     """
 
-    loop_priority: FrozenSet[Tuple[InameStr, ...]] = field(
+    loop_priority: frozenset[tuple[InameStr, ...]] = field(
             default_factory=frozenset)
     """
     A frozenset of priority constraints to the kernel. Each such constraint
@@ -193,22 +199,20 @@ class LoopKernel(Taggable):
     with non-parallel implementation tags.
     """
 
-    applied_iname_rewrites: Tuple[Dict[InameStr, ExpressionT], ...] = ()
+    applied_iname_rewrites: tuple[dict[InameStr, Expression], ...] = ()
     """
     A list of past substitution dictionaries that
     were applied to the kernel. These are stored so that they may be repeated
     on expressions the user specifies later.
     """
-    index_dtype: NumpyType = NumpyType(np.dtype(np.int32))
-    silenced_warnings: FrozenSet[str] = frozenset()
+    index_dtype: NumpyType = NumpyType(np.dtype(np.int32))  # noqa: RUF009
+    silenced_warnings: frozenset[str] = frozenset()
 
     # FIXME Yuck, this should go.
-    overridden_get_grid_sizes_for_insn_ids: Optional[
-            Callable[
-                [FrozenSet[str],
-                    Dict[str, "InKernelCallable"],
-                    bool],
-                Tuple[Tuple[int, ...], Tuple[int, ...]]]] = None
+    overridden_get_grid_sizes_for_insn_ids: \
+        Callable[[frozenset[str], dict[str, InKernelCallable], bool],
+            tuple[tuple[int, ...], tuple[int, ...]]
+        ] | None = None
 
     def __post_init__(self):
         assert isinstance(self.assumptions, isl.BasicSet)
@@ -281,7 +285,7 @@ def get_group_name_generator(self):
         return UniqueNameGenerator(set(self.all_group_names()))
 
     def get_var_descriptor(
-            self, name: str) -> Union[TemporaryVariable, KernelArgument]:
+            self, name: str) -> TemporaryVariable | KernelArgument:
         try:
             return self.arg_dict[name]
         except KeyError:
@@ -317,7 +321,7 @@ def id_to_insn(self):
     # {{{ domain wrangling
 
     @memoize_method
-    def parents_per_domain(self) -> Sequence[Optional[int]]:
+    def parents_per_domain(self) -> Sequence[int | None]:
         """Return a list corresponding to self.domains (by index)
         containing domain indices which are nested around this
         domain.
@@ -331,8 +335,8 @@ def parents_per_domain(self) -> Sequence[Optional[int]]:
         # determines the granularity of inames to be popped/decactivated
         # if we ascend a level.
 
-        iname_set_stack: List[Set[str]] = []
-        result: List[Optional[int]] = []
+        iname_set_stack: list[set[str]] = []
+        result: list[int | None] = []
 
         from loopy.kernel.tools import is_domain_dependent_on_inames
 
@@ -459,7 +463,7 @@ def combine_domains(self, domains: Sequence[int]) -> isl.BasicSet:
 
         return result
 
-    def get_inames_domain(self, inames: FrozenSet[str]) -> isl.BasicSet:
+    def get_inames_domain(self, inames: frozenset[str]) -> isl.BasicSet:
         if not inames:
             return self.combine_domains(())
 
@@ -560,7 +564,7 @@ def all_inames(self):
         return frozenset(self.inames.keys())
 
     @memoize_method
-    def all_params(self) -> FrozenSet[str]:
+    def all_params(self) -> frozenset[str]:
         all_inames = self.all_inames()
 
         result = set()
@@ -758,7 +762,7 @@ def get_unwritten_value_args(self):
     # {{{ argument wrangling
 
     @cached_property
-    def arg_dict(self) -> Dict[str, KernelArgument]:
+    def arg_dict(self) -> dict[str, KernelArgument]:
         return {arg.name: arg for arg in self.args}
 
     @cached_property
@@ -1035,9 +1039,9 @@ def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False,
     def get_grid_size_upper_bounds_as_exprs(
             self, callables_table,
             ignore_auto=False, return_dict=False
-            ) -> Tuple[
-                    Tuple[ArithmeticExpressionT, ...],
-                    Tuple[ArithmeticExpressionT, ...]]:
+            ) -> tuple[
+                    tuple[ArithmeticExpression, ...],
+                    tuple[ArithmeticExpression, ...]]:
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of *all* instructions in the kernel.
 
@@ -1310,7 +1314,7 @@ def __setstate__(self, state):
 
     # {{{ persistent hash key generation / comparison
 
-    hash_fields = [
+    hash_fields: ClassVar[Sequence[str]] = [
             "domains",
             "instructions",
             "args",
@@ -1361,18 +1365,19 @@ def __hash__(self):
 
     # }}}
 
-    def get_copy_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
+    def get_copy_kwargs(self, **kwargs: Any) -> dict[str, Any]:
         if "domains" in kwargs:
             inames = kwargs.get("inames", self.inames)
             domains = kwargs["domains"]
-            kwargs["inames"] = {name: inames.get(name, Iname(name, frozenset()))
+            kwargs["inames"] = {name: inames.get(name,
+                                         loopy.kernel.data.Iname(name, frozenset()))
                                 for name in _get_inames_from_domains(domains)}
 
             assert all(dom.get_ctx() == isl.DEFAULT_CONTEXT for dom in domains)
 
         return kwargs
 
-    def copy(self, **kwargs: Any) -> "LoopKernel":
+    def copy(self, **kwargs: Any) -> LoopKernel:
         result = replace(self, **self.get_copy_kwargs(**kwargs))
 
         object.__setattr__(result, "_cache_manager", self.cache_manager)
@@ -1391,11 +1396,11 @@ def copy(self, **kwargs: Any) -> "LoopKernel":
 
         return result
 
-    def _with_new_tags(self, tags) -> "LoopKernel":
+    def _with_new_tags(self, tags) -> LoopKernel:
         return replace(self, tags=tags)
 
     @memoize_method
-    def _separation_info(self) -> Dict[str, _ArraySeparationInfo]:
+    def _separation_info(self) -> dict[str, _ArraySeparationInfo]:
         return {
                 arg.name: arg._separation_info
                 for arg in self.args
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 8cabbec23..9895685fb 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -24,28 +24,22 @@
 """
 
 import re
-import sys
 from dataclasses import dataclass
 from typing import (
     TYPE_CHECKING,
+    Any,
     Callable,
     ClassVar,
-    FrozenSet,
-    List,
-    Optional,
     Sequence,
     Tuple,
-    Type,
     TypeVar,
-    Union,
     cast,
 )
 from warnings import warn
 
 import numpy as np  # noqa
-from typing_extensions import TypeAlias
+from typing_extensions import Self, TypeAlias
 
-from pymbolic import ArithmeticExpressionT
 from pymbolic.primitives import is_arithmetic_expression
 from pytools import ImmutableRecord
 from pytools.tag import Tag, Taggable
@@ -53,19 +47,17 @@
 from loopy.diagnostic import LoopyError
 from loopy.symbolic import flatten
 from loopy.types import LoopyType
-from loopy.typing import ExpressionT, ShapeType, auto, is_integer
+from loopy.typing import Expression, ShapeType, auto, is_integer
 
 
 if TYPE_CHECKING:
+    from pymbolic import ArithmeticExpression
+
     from loopy.codegen import VectorizationInfo
     from loopy.kernel import LoopKernel
-    from loopy.kernel.data import ArrayArg, TemporaryVariable, auto
+    from loopy.kernel.data import ArrayArg, TemporaryVariable
     from loopy.target import TargetBase
 
-if getattr(sys, "_BUILDING_SPHINX_DOCS", False):
-    from loopy.target import TargetBase  # noqa: F811
-
-
 T = TypeVar("T")
 
 
@@ -93,10 +85,6 @@
 
     See :class:`loopy.typing.ShapeType`
 
-.. class:: ExpressionT
-
-    See :class:`loopy.typing.ExpressionT`
-
 .. class:: Tag
 
     See :class:`pytools.tag.Tag`
@@ -150,7 +138,7 @@ class FixedStrideArrayDimTag(_StrideArrayDimTagBase):
 
         May be one of the following:
 
-        - A :class:`pymbolic.primitives.Expression`, including an
+        - A :data:`~pymbolic.typing.Expression`, including an
           integer, indicating the stride in units of the underlying
           array's :attr:`ArrayBase.dtype`.
 
@@ -609,8 +597,8 @@ def convert_computed_to_fixed_dim_tags(name, num_user_axes, num_target_axes,
 
 # {{{ array base class (for arguments and temporary arrays)
 
-ToShapeLikeConvertible: TypeAlias = (Tuple[ExpressionT | str, ...]
-                | ExpressionT | type[auto] | str | tuple[str, ...])
+ToShapeLikeConvertible: TypeAlias = (Tuple[Expression | str, ...]
+                | Expression | type[auto] | str | tuple[str, ...])
 
 
 def _parse_shape_or_strides(
@@ -634,12 +622,12 @@ def _parse_shape_or_strides(
         raise ValueError("shape can't be a list")
 
     if isinstance(x_parsed, tuple):
-        x_tup: tuple[ExpressionT | str, ...] = x_parsed
+        x_tup: tuple[Expression | str, ...] = x_parsed
     else:
         assert x_parsed is not auto
-        x_tup = (cast(ExpressionT, x_parsed),)
+        x_tup = (cast("Expression", x_parsed),)
 
-    def parse_arith(x: ExpressionT | str) -> ArithmeticExpressionT:
+    def parse_arith(x: Expression | str) -> ArithmeticExpression:
         if isinstance(x, str):
             res = parse(x)
         else:
@@ -677,7 +665,7 @@ class ArrayBase(ImmutableRecord, Taggable):
     """
     name: str
 
-    dtype: Optional[LoopyType]
+    dtype: LoopyType | None
     """The :class:`loopy.types.LoopyType` of the array. If this is *None*,
     :mod:`loopy` will try to continue without knowing the type of this
     array, where the idea is that precise knowledge of the type will become
@@ -689,7 +677,7 @@ class ArrayBase(ImmutableRecord, Taggable):
     cannot be performed without knowledge of the exact *dtype*.
     """
 
-    shape: Union[ShapeType, Type["auto"], None]
+    shape: ShapeType | type[auto] | None
     """
     May be one of the following:
 
@@ -710,11 +698,11 @@ class ArrayBase(ImmutableRecord, Taggable):
       may be *None*.
       """
 
-    dim_tags: Optional[Sequence[ArrayDimImplementationTag]]
+    dim_tags: Sequence[ArrayDimImplementationTag] | None
     """See :ref:`data-dim-tags`.
     """
 
-    offset: Union[ExpressionT, str, None]
+    offset: Expression | str | None
     """Offset from the beginning of the buffer to the point from
     which the strides are counted, in units of the :attr:`dtype`.
     May be one of
@@ -726,7 +714,7 @@ class ArrayBase(ImmutableRecord, Taggable):
       is added automatically, immediately following this argument.
     """
 
-    dim_names: Optional[Tuple[str, ...]]
+    dim_names: tuple[str, ...] | None
     """A tuple of strings providing names for the array axes, or *None*.
     If given, must have the same number of entries as :attr:`dim_tags`
     and :attr:`dim_tags`. These do not live in any particular namespace
@@ -736,7 +724,7 @@ class ArrayBase(ImmutableRecord, Taggable):
     axis numbers.
     """
 
-    alignment: Optional[int]
+    alignment: int | None
     """Memory alignment of the array in bytes. For temporary arrays,
     this ensures they are allocated with this alignment. For arguments,
     this entails a promise that the incoming array obeys this alignment
@@ -751,7 +739,7 @@ class ArrayBase(ImmutableRecord, Taggable):
     .. versionadded:: 2018.1
     """
 
-    tags: FrozenSet[Tag]
+    tags: frozenset[Tag]
     """A (possibly empty) frozenset of instances of
     :class:`pytools.tag.Tag` intended for
     consumption by an application.
@@ -762,7 +750,7 @@ class ArrayBase(ImmutableRecord, Taggable):
     # Note that order may also wind up in attributes, if the
     # number of dimensions has not yet been determined.
 
-    allowed_extra_kwargs: ClassVar[Tuple[str, ...]] = ()
+    allowed_extra_kwargs: ClassVar[tuple[str, ...]] = ()
 
     def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0,
             dim_names=None, strides=None, order=None, for_atomic=False,
@@ -1080,16 +1068,18 @@ def num_user_axes(self, require_answer=True):
         else:
             return None
 
-    def map_exprs(self, mapper):
+    def map_exprs(self, mapper: Callable[[Expression], Expression]) -> Self:
         """Return a copy of self with all expressions replaced with what *mapper*
         transformed them into.
         """
         changed = False
-        kwargs = {}
+        kwargs: dict[str, Any] = {}
         import loopy as lp
 
         if self.shape is not None and self.shape is not lp.auto:
-            def none_pass_mapper(s):
+            assert isinstance(self.shape, tuple)
+
+            def none_pass_mapper(s: Expression | None) -> Expression | None:
                 if s is None:
                     return s
                 else:
@@ -1151,16 +1141,16 @@ def vector_size(self, target: TargetBase) -> int:
 # }}}
 
 def drop_vec_dims(
-        dim_tags: Tuple[ArrayDimImplementationTag, ...],
-        t: Tuple[T, ...]) -> Tuple[T, ...]:
+        dim_tags: tuple[ArrayDimImplementationTag, ...],
+        t: tuple[T, ...]) -> tuple[T, ...]:
     assert len(dim_tags) == len(t)
     return tuple(t_i for dim_tag, t_i in zip(dim_tags, t)
             if not isinstance(dim_tag, VectorArrayDimTag))
 
 
-def get_strides(array: ArrayBase) -> Tuple[ExpressionT, ...]:
+def get_strides(array: ArrayBase) -> tuple[Expression, ...]:
     from pymbolic import var
-    result: List[ExpressionT] = []
+    result: list[Expression] = []
 
     if array.dim_tags is None:
         return ()
@@ -1187,11 +1177,11 @@ def get_strides(array: ArrayBase) -> Tuple[ExpressionT, ...]:
 @dataclass(frozen=True)
 class AccessInfo(ImmutableRecord):
     array_name: str
-    vector_index: Optional[int]
-    subscripts: Tuple[ExpressionT, ...]
+    vector_index: int | None
+    subscripts: tuple[Expression, ...]
 
 
-def _apply_offset(sub: ExpressionT, ary: ArrayBase) -> ExpressionT:
+def _apply_offset(sub: Expression, ary: ArrayBase) -> Expression:
     """
     Helper for :func:`get_access_info`.
     Augments *ary*'s subscript index expression (*sub*) with its offset info.
@@ -1221,16 +1211,16 @@ def _apply_offset(sub: ExpressionT, ary: ArrayBase) -> ExpressionT:
         else:
             # assume it's an expression
             # FIXME: mypy can't figure out that ExpressionT + ExpressionT works
-            return ary.offset + sub  # type: ignore[call-overload, arg-type, operator]  # noqa: E501
+            return ary.offset + sub  # type: ignore[call-overload, arg-type, operator]
     else:
         return sub
 
 
-def get_access_info(kernel: "LoopKernel",
-        ary: Union["ArrayArg", "TemporaryVariable"],
-        index: Union[ExpressionT, Tuple[ExpressionT, ...]],
-        eval_expr: Callable[[ExpressionT], int],
-        vectorization_info: "VectorizationInfo") -> AccessInfo:
+def get_access_info(kernel: LoopKernel,
+        ary: ArrayArg | TemporaryVariable,
+        index: Expression | tuple[Expression, ...],
+        eval_expr: Callable[[Expression], int],
+        vectorization_info: VectorizationInfo) -> AccessInfo:
     """
     :arg ary: an object of type :class:`ArrayBase`
     :arg index: a tuple of indices representing a subscript into ary
@@ -1283,7 +1273,7 @@ def eval_expr_assert_integer_constant(i, expr) -> int:
     num_target_axes = ary.num_target_axes()
 
     vector_index = None
-    subscripts: List[ExpressionT] = [0] * num_target_axes
+    subscripts: list[Expression] = [0] * num_target_axes
 
     vector_size = ary.vector_size(kernel.target)
 
@@ -1302,7 +1292,7 @@ def eval_expr_assert_integer_constant(i, expr) -> int:
 
         index = tuple(remaining_index)
         # only arguments (not temporaries) may be sep-tagged
-        ary = cast(ArrayArg,
+        ary = cast("ArrayArg",
             kernel.arg_dict[ary._separation_info.subarray_names[tuple(sep_index)]])
 
     # }}}
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 4f1803f24..e7228468c 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -1,4 +1,5 @@
 """UI for kernel creation."""
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -155,13 +156,13 @@ def expand_defines(insn, defines, single_valued=True):
                             "in this context (when expanding '%s')" % define_name)
 
                 replacements = [
-                        rep+((replace_pattern % define_name, subval),)
+                        (*rep, (replace_pattern % define_name, subval))
                         for rep in replacements
                         for subval in value
                         ]
             else:
                 replacements = [
-                        rep+((replace_pattern % define_name, value),)
+                        (*rep, (replace_pattern % define_name, value))
                         for rep in replacements]
 
     for rep in replacements:
@@ -285,14 +286,12 @@ def parse_nosync_option(opt_value):
                 arrow_idx = value.find("->")
                 if arrow_idx >= 0:
                     result["inames_to_dup"] = (
-                            result.get("inames_to_dup", [])
-                            +
-                            [(value[:arrow_idx], value[arrow_idx+2:])])
+                            [*result.get("inames_to_dup", []),
+                                (value[:arrow_idx], value[arrow_idx + 2:])
+                            ])
                 else:
                     result["inames_to_dup"] = (
-                            result.get("inames_to_dup", [])
-                            +
-                            [(value, None)])
+                            [*result.get("inames_to_dup", []), (value, None)])
 
         elif opt_key == "dep" and opt_value is not None:
             if opt_value.startswith("*"):
@@ -681,7 +680,7 @@ def _count_open_paren_symbols(s):
     for c in s:
         val = _PAREN_PAIRS.get(c)
         if val is not None:
-            increment, cls = val
+            increment, _cls = val
             result += increment
 
     return result
@@ -2403,7 +2402,7 @@ def make_function(domains, instructions, kernel_data=None, **kwargs):
             kernel_args.append(dat)
             continue
 
-        if isinstance(dat, ArrayBase) and isinstance(dat.shape, tuple):  # noqa pylint:disable=no-member
+        if isinstance(dat, ArrayBase) and isinstance(dat.shape, tuple):  # pylint: disable=no-member
             new_shape = []
             for shape_axis in dat.shape:  # pylint:disable=no-member
                 if shape_axis is not None:
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 668e6a07d..913c946ec 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -30,22 +30,18 @@
 from enum import Enum, IntEnum
 from sys import intern
 from typing import (
+    TYPE_CHECKING,
     Any,
     ClassVar,
-    FrozenSet,
-    Optional,
     Sequence,
     Tuple,
-    Type,
     Union,
     cast,
 )
 
 import numpy  # FIXME: imported as numpy to allow sphinx to resolve things
 import numpy as np
-from immutables import Map
 
-from pymbolic import ArithmeticExpressionT, Variable
 from pytools import ImmutableRecord
 from pytools.tag import Tag, Taggable, UniqueTag as UniqueTagBase
 
@@ -64,8 +60,15 @@
     VarAtomicity,
     make_assignment,
 )
-from loopy.types import LoopyType, ToLoopyTypeConvertible
-from loopy.typing import ExpressionT, ShapeType, auto
+from loopy.typing import Expression, ShapeType, auto
+
+
+if TYPE_CHECKING:
+    from immutables import Map
+
+    from pymbolic import ArithmeticExpression, Variable
+
+    from loopy.types import LoopyType, ToLoopyTypeConvertible
 
 
 __doc__ = """
@@ -103,17 +106,17 @@
 
 # {{{ utilities
 
-def _names_from_expr(expr: Union[None, ExpressionT, str]) -> FrozenSet[str]:
+def _names_from_expr(expr: Expression | str | None) -> frozenset[str]:
     from numbers import Number
 
     from loopy.symbolic import DependencyMapper
-    dep_mapper = DependencyMapper()
+    dep_mapper: DependencyMapper[[]] = DependencyMapper()
 
-    from pymbolic.primitives import Expression
+    from pymbolic.primitives import ExpressionNode
     if isinstance(expr, str):
         return frozenset({expr})
-    elif isinstance(expr, Expression):
-        return frozenset(cast(Variable, v).name for v in dep_mapper(expr))
+    elif isinstance(expr, ExpressionNode):
+        return frozenset(cast("Variable", v).name for v in dep_mapper(expr))
     elif expr is None:
         return frozenset()
     elif isinstance(expr, Number):
@@ -123,7 +126,7 @@ def _names_from_expr(expr: Union[None, ExpressionT, str]) -> FrozenSet[str]:
 
 
 def _names_from_dim_tags(
-        dim_tags: Optional[Sequence[ArrayDimImplementationTag]]) -> FrozenSet[str]:
+        dim_tags: Sequence[ArrayDimImplementationTag] | None) -> frozenset[str]:
     from loopy.kernel.array import FixedStrideArrayDimTag
     if dim_tags is not None:
         return frozenset({
@@ -171,7 +174,7 @@ def strify_tag_type():
 
 
 class InameImplementationTag(ImmutableRecord, UniqueTagBase):
-    __slots__: ClassVar[Tuple[str, ...]] = ()
+    __slots__: ClassVar[tuple[str, ...]] = ()
 
     def __hash__(self):
         return hash(self.key)
@@ -303,10 +306,10 @@ def __str__(self):
         return "ord"
 
 
-ToInameTagConvertible = Union[str, None, Tag]
+ToInameTagConvertible = Union[str, Tag, None]
 
 
-def parse_tag(tag: ToInameTagConvertible) -> Optional[Tag]:
+def parse_tag(tag: ToInameTagConvertible) -> Tag | None:
     if tag is None:
         return tag
 
@@ -365,7 +368,7 @@ class AddressSpace(IntEnum):
     GLOBAL = 2
 
     @classmethod
-    def stringify(cls, val: Union["AddressSpace", Type[auto]]) -> str:
+    def stringify(cls, val: AddressSpace | type[auto]) -> str:
         if val == cls.PRIVATE:
             return "private"
         elif val == cls.LOCAL:
@@ -397,7 +400,7 @@ class KernelArgument(ImmutableRecord):
     .. automethod:: supporting_names
     """
     name: str
-    dtype: Optional[LoopyType]
+    dtype: LoopyType | None
     is_output: bool
     is_input: bool
 
@@ -422,7 +425,7 @@ def __init__(self, **kwargs):
 
         ImmutableRecord.__init__(self, **kwargs)
 
-    def supporting_names(self) -> FrozenSet[str]:
+    def supporting_names(self) -> frozenset[str]:
         """'Supporting' names are those that are likely to be required to be
         present for any use of the argument.
         """
@@ -437,12 +440,12 @@ class _ArraySeparationInfo:
     this records the names of the actually present sub-arrays that
     should be used to realize this array.
     """
-    sep_axis_indices_set: FrozenSet[int]
-    subarray_names: Map[Tuple[int, ...], str]
+    sep_axis_indices_set: frozenset[int]
+    subarray_names: Map[tuple[int, ...], str]
 
 
 class ArrayArg(ArrayBase, KernelArgument):
-    __doc__ = cast(str, ArrayBase.__doc__) + (
+    __doc__ = cast("str", ArrayBase.__doc__) + (
         """
         .. attribute:: address_space
 
@@ -465,7 +468,7 @@ class ArrayArg(ArrayBase, KernelArgument):
     address_space: AddressSpace
 
     # _separation_info is not user-facing and hence not documented.
-    _separation_info: Optional[_ArraySeparationInfo]
+    _separation_info: _ArraySeparationInfo | None
 
     allowed_extra_kwargs = (
             "address_space",
@@ -517,7 +520,7 @@ def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, self.is_input)
         key_builder.rec(key_hash, self._separation_info)
 
-    def supporting_names(self) -> FrozenSet[str]:
+    def supporting_names(self) -> frozenset[str]:
         # Do not consider separation info here: The subarrays don't support, they
         # replace this array.
         return (
@@ -580,7 +583,7 @@ def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written):
         return ast_builder.get_image_arg_decl(self.name + name_suffix, shape,
                 self.num_target_axes(), dtype, is_written)
 
-    def supporting_names(self) -> FrozenSet[str]:
+    def supporting_names(self) -> frozenset[str]:
         return (
                 _names_from_expr(self.offset)
                 | _names_from_dim_tags(self.dim_tags)
@@ -644,7 +647,7 @@ def get_arg_decl(self, ast_builder):
 # {{{ temporary variable
 
 class TemporaryVariable(ArrayBase):
-    __doc__ = cast(str, ArrayBase.__doc__) + """
+    __doc__ = cast("str", ArrayBase.__doc__) + """
     .. autoattribute:: storage_shape
     .. autoattribute:: base_indices
     .. autoattribute:: address_space
@@ -654,17 +657,17 @@ class TemporaryVariable(ArrayBase):
     .. autoattribute:: _base_storage_access_may_be_aliasing
     """
 
-    storage_shape: Optional[ShapeType]
-    base_indices: Optional[Tuple[ExpressionT, ...]]
-    address_space: Union[AddressSpace, Type[auto]]
-    base_storage: Optional[str]
+    storage_shape: ShapeType | None
+    base_indices: tuple[Expression, ...] | None
+    address_space: AddressSpace | type[auto]
+    base_storage: str | None
     """The name of a storage array that is to be used to actually
     hold the data in this temporary, or *None*. If not *None* or the name
     of an existing variable, a variable of this name and appropriate size
     will be created.
     """
 
-    initializer: Optional[numpy.ndarray]
+    initializer: numpy.ndarray | None
     """*None* or a :class:`numpy.ndarray` of data to be used to initialize the
     array.
     """
@@ -699,19 +702,19 @@ def __init__(
                 self,
                 name: str,
                 dtype: ToLoopyTypeConvertible = None,
-                shape: Union[ShapeType, Type["auto"], None] = auto,
-                address_space: Union[AddressSpace, Type[auto], None] = None,
-                dim_tags: Optional[Sequence[ArrayDimImplementationTag]] = None,
-                offset: Union[ExpressionT, str, None] = 0,
-                dim_names: Optional[Tuple[str, ...]] = None,
-                strides: Optional[Tuple[ExpressionT, ...]] = None,
+                shape: ShapeType | type[auto] | None = auto,
+                address_space: AddressSpace | type[auto] | None = None,
+                dim_tags: Sequence[ArrayDimImplementationTag] | None = None,
+                offset: Expression | str | None = 0,
+                dim_names: tuple[str, ...] | None = None,
+                strides: tuple[Expression, ...] | None = None,
                 order: str | None = None,
 
-                base_indices: Optional[Tuple[ExpressionT, ...]] = None,
+                base_indices: tuple[Expression, ...] | None = None,
                 storage_shape: ShapeType | None = None,
 
-                base_storage: Optional[str] = None,
-                initializer: Optional[np.ndarray] = None,
+                base_storage: str | None = None,
+                initializer: np.ndarray | None = None,
                 read_only: bool = False,
 
                 _base_storage_access_may_be_aliasing: bool = False,
@@ -813,7 +816,7 @@ def copy(self, **kwargs: Any) -> TemporaryVariable:
         return super().copy(**kwargs)
 
     @property
-    def nbytes(self) -> ExpressionT:
+    def nbytes(self) -> Expression:
         if self.storage_shape is not None:
             shape = self.storage_shape
         else:
@@ -821,7 +824,7 @@ def nbytes(self) -> ExpressionT:
                 raise ValueError("shape is None")
             if self.shape is auto:
                 raise ValueError("shape is auto")
-            shape = cast(Tuple[ArithmeticExpressionT], self.shape)
+            shape = cast("Tuple[ArithmeticExpression]", self.shape)
 
         if self.dtype is None:
             raise ValueError("data type is indeterminate")
@@ -878,7 +881,7 @@ def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, self.read_only)
         key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing)
 
-    def supporting_names(self) -> FrozenSet[str]:
+    def supporting_names(self) -> frozenset[str]:
         return (
                 _names_from_expr(self.offset)
                 | _names_from_dim_tags(self.dim_tags)
@@ -902,7 +905,7 @@ class SubstitutionRule:
 
     name: str
     arguments: Sequence[str]
-    expression: ExpressionT
+    expression: Expression
 
     def copy(self, **kwargs: Any) -> SubstitutionRule:
         return replace(self, **kwargs)
@@ -970,9 +973,9 @@ class Iname(Taggable):
         An instance of :class:`frozenset` of :class:`pytools.tag.Tag`.
     """
     name: str
-    tags: FrozenSet[Tag]
+    tags: frozenset[Tag]
 
-    def copy(self, **kwargs: Any) -> "Iname":
+    def copy(self, **kwargs: Any) -> Iname:
         return replace(self, **kwargs)
 
     def _with_new_tags(self, tags):
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 22abeb8ae..146d40f4f 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -22,38 +22,41 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
-
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Callable, ClassVar, FrozenSet, Tuple, TypeVar
+from dataclasses import dataclass, replace
+from typing import TYPE_CHECKING, Any, Callable, TypeVar
+from warnings import warn
 
-from pytools import ImmutableRecord
+from immutabledict import immutabledict
+from typing_extensions import Self
 
 from loopy.diagnostic import LoopyError
-from loopy.kernel import LoopKernel
-from loopy.kernel.array import ArrayBase
-from loopy.kernel.data import ArrayArg, ValueArg
+from loopy.kernel.array import ArrayBase, ArrayDimImplementationTag
+from loopy.kernel.data import AddressSpace, ArrayArg, ValueArg
 from loopy.symbolic import DependencyMapper, WalkMapper
-from loopy.tools import update_persistent_hash
 
 
 if TYPE_CHECKING:
+    from collections.abc import Mapping, Sequence
+
     from typing_extensions import Self
 
+    from loopy.kernel import LoopKernel
     from loopy.translation_unit import CallablesTable, FunctionIdT
+    from loopy.types import LoopyType
+    from loopy.typing import ShapeType
 
 __doc__ = """
 .. currentmodule:: loopy.kernel.function_interface
 
+.. autoclass:: ArgDescriptor
 .. autoclass:: ValueArgDescriptor
-
 .. autoclass:: ArrayArgDescriptor
 
 .. currentmodule:: loopy
 
 .. autoclass:: InKernelCallable
-
 .. autoclass:: CallableKernel
-
 .. autoclass:: ScalarCallable
 """
 
@@ -63,7 +66,7 @@
 ArgDescriptorT = TypeVar("ArgDescriptorT", bound="ArgDescriptor")
 
 
-class ArgDescriptor(ABC, ImmutableRecord):
+class ArgDescriptor(ABC):
     @abstractmethod
     def map_expr(
                 self,
@@ -75,19 +78,25 @@ def map_expr(
     def depends_on(self) -> frozenset[str]:
         ...
 
+    @abstractmethod
+    def copy(self, **kwargs: Any) -> Self:
+        ...
 
+
+@dataclass(frozen=True)
 class ValueArgDescriptor(ArgDescriptor):
-    hash_fields = ()
 
     def map_expr(self, subst_mapper):
-        return self.copy()
+        return self
 
     def depends_on(self):
         return frozenset()
 
-    update_persistent_hash = update_persistent_hash
+    def copy(self, **kwargs: Any) -> Self:
+        return replace(self, **kwargs)
 
 
+@dataclass(frozen=True)
 class ArrayArgDescriptor(ArgDescriptor):
     """
     Records information about an array argument to an in-kernel callable. To be
@@ -95,46 +104,39 @@ class ArrayArgDescriptor(ArgDescriptor):
     :meth:`~loopy.InKernelCallable.with_descrs`, used for
     matching shape and address space of caller and callee kernels.
 
-    .. attribute:: shape
-
-        Shape of the array.
-
-    .. attribute:: address_space
-
-        An attribute of :class:`loopy.AddressSpace`.
-
-    .. attribute:: dim_tags
-
-        A tuple of instances of
-        :class:`loopy.kernel.array.ArrayDimImplementationTag`
+    .. autoattribute:: shape
+    .. autoattribute:: address_space
+    .. autoattribute:: dim_tags
 
     .. automethod:: map_expr
     .. automethod:: depends_on
     """
 
-    fields = {"shape", "address_space", "dim_tags"}
-
-    def __init__(self, shape, address_space, dim_tags):
+    shape: ShapeType | None
+    address_space: AddressSpace
+    dim_tags: Sequence[ArrayDimImplementationTag] | None
+    """See :ref:`data-dim-tags`.
+    """
 
-        # {{{ sanity checks
+    if __debug__:
+        def __post_init__(self):
+            # {{{ sanity checks
 
-        from loopy.kernel.array import ArrayDimImplementationTag
-        from loopy.kernel.data import auto
+            from loopy.kernel.array import ArrayDimImplementationTag
+            from loopy.kernel.data import auto
 
-        assert isinstance(shape, tuple) or shape in [None, auto]
-        assert isinstance(dim_tags, tuple) or dim_tags is None
+            assert isinstance(self.shape, tuple) or self.shape in [None, auto]
+            assert isinstance(self.dim_tags, tuple) or self.dim_tags is None
 
-        if dim_tags:
-            # FIXME at least vector dim tags should be supported
-            assert all(isinstance(dim_tag, ArrayDimImplementationTag) for dim_tag in
-                    dim_tags)
+            if self.dim_tags:
+                # FIXME at least vector dim tags should be supported
+                assert all(isinstance(dim_tag, ArrayDimImplementationTag) for dim_tag in
+                        self.dim_tags)
 
-        # }}}
+            # }}}
 
-        super().__init__(
-                shape=shape,
-                address_space=address_space,
-                dim_tags=dim_tags)
+    def copy(self, **kwargs: Any) -> Self:
+        return replace(self, **kwargs)
 
     def map_expr(self, f):
         """
@@ -173,11 +175,6 @@ def depends_on(self):
 
         return frozenset(var.name for var in result)
 
-    def update_persistent_hash(self, key_hash, key_builder):
-        key_builder.rec(key_hash, self.shape)
-        key_builder.rec(key_hash, self.address_space)
-        key_builder.rec(key_hash, self.dim_tags)
-
 
 class ExpressionIsScalarChecker(WalkMapper):
     def __init__(self, kernel):
@@ -308,25 +305,14 @@ def get_kw_pos_association(kernel):
 
 # {{{ template class
 
-class InKernelCallable(ImmutableRecord):
+@dataclass(frozen=True, init=False)
+class InKernelCallable(ABC):
     """
     An abstract interface to define a callable encountered in a kernel.
 
-    .. attribute:: name
-
-        The name of the callable which can be encountered within expressions in
-        a kernel.
-
-    .. attribute:: arg_id_to_dtype
-
-        A mapping which indicates the arguments types and result types of the
-        callable.
-
-    .. attribute:: arg_id_to_descr
-
-        A mapping which gives indicates the argument shape and ``dim_tags`` it
-        would be responsible for generating code.
-
+    .. autoattribute:: name
+    .. autoattribute:: arg_id_to_dtype
+    .. autoattribute:: arg_id_to_descr
 
     .. automethod:: __init__
     .. automethod:: with_types
@@ -352,17 +338,39 @@ class InKernelCallable(ImmutableRecord):
           return value with (0-based) index *i*.
 
     """
+    arg_id_to_dtype: Mapping[int | str, LoopyType] | None
+    arg_id_to_descr: Mapping[int | str, ArgDescriptor] | None
+
+    def __init__(self,
+                 arg_id_to_dtype: Mapping[int | str, LoopyType] | None = None,
+                 arg_id_to_descr: Mapping[int | str, ArgDescriptor] | None = None,
+             ) -> None:
+        try:
+            hash(arg_id_to_dtype)
+        except TypeError:
+            arg_id_to_dtype = immutabledict(arg_id_to_dtype)
+            warn("arg_id_to_dtype passed to InKernelCallable was not hashable. "
+                 "This usage is deprecated and will stop working in 2026.",
+                 DeprecationWarning, stacklevel=3)
+
+        try:
+            hash(arg_id_to_descr)
+        except TypeError:
+            arg_id_to_descr = immutabledict(arg_id_to_descr)
+            warn("arg_id_to_descr passed to InKernelCallable was not hashable. "
+                 "This usage is deprecated and will stop working in 2026.",
+                 DeprecationWarning, stacklevel=3)
+
+        object.__setattr__(self, "arg_id_to_dtype", arg_id_to_dtype)
+        object.__setattr__(self, "arg_id_to_descr", arg_id_to_descr)
+
+    if TYPE_CHECKING:
+        @property
+        def name(self) -> str:
+            raise NotImplementedError()
 
-    hash_fields: ClassVar[Tuple[str, ...]] = (
-            "name", "arg_id_to_dtype", "arg_id_to_descr")
-
-    def __init__(self, name, arg_id_to_dtype=None, arg_id_to_descr=None):
-
-        super().__init__(name=name,
-                         arg_id_to_dtype=arg_id_to_dtype,
-                         arg_id_to_descr=arg_id_to_descr)
-
-    update_persistent_hash = update_persistent_hash
+    def copy(self, **kwargs: Any) -> Self:
+        return replace(self, **kwargs)
 
     def with_types(self, arg_id_to_dtype, clbl_inf_ctx):
         """
@@ -391,6 +399,7 @@ def with_types(self, arg_id_to_dtype, clbl_inf_ctx):
 
         raise NotImplementedError()
 
+    @abstractmethod
     def with_descrs(self, arg_id_to_descr, clbl_inf_ctx):
         """
         :arg arg_id_to_descr: a mapping from argument identifiers (integers for
@@ -418,12 +427,11 @@ def with_descrs(self, arg_id_to_descr, clbl_inf_ctx):
             other callables within it, then *clbl_inf_ctx* is returned as is.
         """
 
-        raise NotImplementedError()
-
-    def is_ready_for_codegen(self):
+    def is_ready_for_codegen(self) -> bool:
         return (self.arg_id_to_dtype is not None and
                 self.arg_id_to_descr is not None)
 
+    @abstractmethod
     def get_hw_axes_sizes(self, arg_id_to_arg, space, callables_table):
         """
         Returns ``gsizes, lsizes``, where *gsizes* and *lsizes* are mappings
@@ -435,26 +443,28 @@ def get_hw_axes_sizes(self, arg_id_to_arg, space, callables_table):
             arguments at a call-site.
         :arg space: An instance of :class:`islpy.Space`.
         """
-        raise NotImplementedError
+        ...
 
+    @abstractmethod
     def get_used_hw_axes(self, callables_table):
         """
         Returns a tuple ``group_axes_used, local_axes_used``, where
         ``(group|local)_axes_used`` are :class:`frozenset` of hardware axes
         indices used by the callable.
         """
-        raise NotImplementedError
 
+    @abstractmethod
     def generate_preambles(self, target):
         """
         Yields the target specific preamble.
         """
         raise NotImplementedError()
 
+    @abstractmethod
     def emit_call(self, expression_to_code_mapper, expression, target):
+        ...
 
-        raise NotImplementedError()
-
+    @abstractmethod
     def emit_call_insn(self, insn, target, expression_to_code_mapper):
         """
         Returns a tuple of ``(call, assignee_is_returned)`` which is the target
@@ -469,23 +479,19 @@ def emit_call_insn(self, insn, target, expression_to_code_mapper):
             in the target as the statement ``f(c, d, &a, &b)``.
         """
 
-        raise NotImplementedError()
-
-    def __hash__(self):
-        return hash(self.hash_fields)
-
+    @abstractmethod
     def with_added_arg(self, arg_dtype, arg_descr):
         """
         Registers a new argument to the callable and returns the name of the
         argument in the callable's namespace.
         """
-        raise NotImplementedError()
 
+    @abstractmethod
     def get_called_callables(
                              self,
                              callables_table: CallablesTable,
                              recursive: bool = True
-                         ) -> FrozenSet[FunctionIdT]:
+                         ) -> frozenset[FunctionIdT]:
         """
         Returns a :class:`frozenset` of callable ids called by *self* that are
         resolved via *callables_table*.
@@ -496,27 +502,27 @@ def get_called_callables(
             callables, else only returns the callables directly called by
             *self*.
         """
-        raise NotImplementedError
 
+    @abstractmethod
     def with_name(self, name):
         """
         Returns a copy of *self* so that it could be referred by *name*
         in a :attr:`loopy.TranslationUnit.callables_table`'s namespace.
         """
-        raise NotImplementedError
 
+    @abstractmethod
     def is_type_specialized(self):
         """
         Returns *True* iff *self*'s type signature is known, else returns
         *False*.
         """
-        raise NotImplementedError
 
 # }}}
 
 
 # {{{ scalar callable
 
+@dataclass(frozen=True, init=False)
 class ScalarCallable(InKernelCallable):
     """
     An abstract interface to a scalar callable encountered in a kernel.
@@ -537,15 +543,20 @@ class ScalarCallable(InKernelCallable):
         The :meth:`ScalarCallable.with_types` is intended to assist with type
         specialization of the function and sub-classes must define it.
     """
-    fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"}
-    hash_fields = InKernelCallable.hash_fields + ("name_in_target",)
-
-    def __init__(self, name, arg_id_to_dtype=None,
-                 arg_id_to_descr=None, name_in_target=None):
-        super().__init__(name=name,
-                         arg_id_to_dtype=arg_id_to_dtype,
-                         arg_id_to_descr=arg_id_to_descr)
-        self.name_in_target = name_in_target
+    name: str
+    name_in_target: str | None
+
+    def __init__(self,
+                 name: str,
+                 arg_id_to_dtype: Mapping[int | str, LoopyType] | None = None,
+                 arg_id_to_descr: Mapping[int | str, ArgDescriptor] | None = None,
+                 name_in_target: str | None = None) -> None:
+        super().__init__(
+            arg_id_to_dtype=arg_id_to_dtype,
+            arg_id_to_descr=arg_id_to_descr,
+        )
+        object.__setattr__(self, "name", name)
+        object.__setattr__(self, "name_in_target", name_in_target)
 
     def with_types(self, arg_id_to_dtype, callables_table):
         raise LoopyError("No type inference information present for "
@@ -689,6 +700,7 @@ def is_type_specialized(self):
 
 # {{{ callable kernel
 
+@dataclass(frozen=True, init=False)
 class CallableKernel(InKernelCallable):
     """
     Records information about a callee kernel. Also provides interface through
@@ -702,35 +714,27 @@ class CallableKernel(InKernelCallable):
     :meth:`CallableKernel.with_descrs` should be called in order to match
     the arguments' shapes/strides across the caller and the callee kernel.
 
-    .. attribute:: subkernel
-
-        :class:`~loopy.LoopKernel` which is being called.
-
+    .. autoattribute:: subkernel
     .. automethod:: with_descrs
     .. automethod:: with_types
     """
 
-    fields = {"subkernel", "arg_id_to_dtype", "arg_id_to_descr"}
-    hash_fields = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr")
+    subkernel: LoopKernel
+
+    def __init__(self,
+                 subkernel: LoopKernel,
+                 arg_id_to_dtype: Mapping[int | str, LoopyType] | None = None,
+                 arg_id_to_descr: Mapping[int | str, ArgDescriptor] | None = None,
+             ) -> None:
 
-    def __init__(self, subkernel, arg_id_to_dtype=None,
-                 arg_id_to_descr=None):
-        assert isinstance(subkernel, LoopKernel)
-        super().__init__(name=subkernel.name,
+        super().__init__(
                          arg_id_to_dtype=arg_id_to_dtype,
                          arg_id_to_descr=arg_id_to_descr)
-        self.subkernel = subkernel
-
-    def copy(self, subkernel=None, arg_id_to_dtype=None,
-             arg_id_to_descr=None):
-        if subkernel is None:
-            subkernel = self.subkernel
-        if arg_id_to_descr is None:
-            arg_id_to_descr = self.arg_id_to_descr
-        if arg_id_to_dtype is None:
-            arg_id_to_dtype = self.arg_id_to_dtype
+        object.__setattr__(self, "subkernel", subkernel)
 
-        return CallableKernel(subkernel, arg_id_to_dtype, arg_id_to_descr)
+    @property
+    def name(self) -> str:
+        return self.subkernel.name
 
     def with_types(self, arg_id_to_dtype, callables_table):
         kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
@@ -769,7 +773,7 @@ def with_types(self, arg_id_to_dtype, callables_table):
         # Return the kernel call with specialized subkernel and the corresponding
         # new arg_id_to_dtype
         return self.copy(subkernel=specialized_kernel,
-                arg_id_to_dtype=new_arg_id_to_dtype), callables_table
+                arg_id_to_dtype=immutabledict(new_arg_id_to_dtype)), callables_table
 
     def with_descrs(self, arg_id_to_descr, clbl_inf_ctx):
 
@@ -844,7 +848,7 @@ def with_descrs(self, arg_id_to_descr, clbl_inf_ctx):
         # }}}
 
         return (self.copy(subkernel=subkernel,
-                          arg_id_to_descr=arg_id_to_descr),
+                          arg_id_to_descr=immutabledict(arg_id_to_descr)),
                 clbl_inf_ctx)
 
     def with_added_arg(self, arg_dtype, arg_descr):
@@ -852,19 +856,20 @@ def with_added_arg(self, arg_dtype, arg_descr):
 
         if isinstance(arg_descr, ValueArgDescriptor):
             subknl = self.subkernel.copy(
-                    args=self.subkernel.args+[
+                    args=[
+                        *self.subkernel.args,
                         ValueArg(var_name, arg_dtype, self.subkernel.target)])
 
-            kw_to_pos, pos_to_kw = get_kw_pos_association(subknl)
+            kw_to_pos, _pos_to_kw = get_kw_pos_association(subknl)
 
             if self.arg_id_to_dtype is None:
                 arg_id_to_dtype = {}
             else:
-                arg_id_to_dtype = self.arg_id_to_dtype.copy()
+                arg_id_to_dtype = dict(self.arg_id_to_dtype)
             if self.arg_id_to_descr is None:
                 arg_id_to_descr = {}
             else:
-                arg_id_to_descr = self.arg_id_to_descr.copy()
+                arg_id_to_descr = dict(self.arg_id_to_descr)
 
             arg_id_to_dtype[var_name] = arg_dtype
             arg_id_to_descr[var_name] = arg_descr
@@ -883,7 +888,7 @@ def with_added_arg(self, arg_dtype, arg_descr):
 
     def with_packing_for_args(self):
         from loopy.kernel.data import AddressSpace
-        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
+        _kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
 
         arg_id_to_descr = {}
 
@@ -931,6 +936,10 @@ def generate_preambles(self, target):
         return
         yield
 
+    def emit_call(self, expression_to_code_mapper, expression, target):
+        raise LoopyError(f"Kernel '{self.name}' cannot be called "
+                         "from within an expression, use a call statement")
+
     def emit_call_insn(self, insn, target, expression_to_code_mapper):
         from loopy.target.c import CFamilyTarget
         if not isinstance(target, CFamilyTarget):
@@ -947,7 +956,7 @@ def emit_call_insn(self, insn, target, expression_to_code_mapper):
 
         parameters = list(parameters)
         par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)]
-        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
+        _kw_to_pos, _pos_to_kw = get_kw_pos_association(self.subkernel)
 
         # insert the assignees at the required positions
         assignee_write_count = -1
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 51d4856da..f882c09f3 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
 
 __license__ = """
@@ -27,7 +30,13 @@
 from dataclasses import dataclass
 from functools import cached_property
 from sys import intern
-from typing import Any, FrozenSet, Mapping, Optional, Sequence, Tuple, Type, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    ClassVar,
+    Mapping,
+    Sequence,
+)
 from warnings import warn
 
 import islpy as isl
@@ -36,8 +45,11 @@
 
 from loopy.diagnostic import LoopyError
 from loopy.tools import Optional as LoopyOptional
-from loopy.types import LoopyType
-from loopy.typing import ExpressionT, InameStr
+
+
+if TYPE_CHECKING:
+    from loopy.types import LoopyType
+    from loopy.typing import Expression, InameStr
 
 
 # {{{ instruction tags
@@ -115,8 +127,8 @@ class HappensAfter:
         statement-level dependencies of prior versions of :mod:`loopy`.
     """
 
-    variable_name: Optional[str]
-    instances_rel: Optional[isl.Map]
+    variable_name: str | None
+    instances_rel: isl.Map | None
 
 # }}}
 
@@ -244,20 +256,20 @@ class InstructionBase(ImmutableRecord, Taggable):
 
     Inherits from :class:`pytools.tag.Taggable`.
     """
-    id: Optional[str]
+    id: str | None
     happens_after: Mapping[str, HappensAfter]
     depends_on_is_final: bool
-    groups: FrozenSet[str]
-    conflicts_with_groups: FrozenSet[str]
-    no_sync_with: FrozenSet[Tuple[str, str]]
-    predicates: FrozenSet[ExpressionT]
-    within_inames: FrozenSet[InameStr]
+    groups: frozenset[str]
+    conflicts_with_groups: frozenset[str]
+    no_sync_with: frozenset[tuple[str, str]]
+    predicates: frozenset[Expression]
+    within_inames: frozenset[InameStr]
     within_inames_is_final: bool
     priority: int
 
     # within_inames_is_final is deprecated and will be removed in version 2017.x.
 
-    fields = set("id depends_on_is_final "
+    fields: ClassVar[set[str]] = set("id depends_on_is_final "
             "groups conflicts_with_groups "
             "no_sync_with "
             "predicates "
@@ -265,20 +277,20 @@ class InstructionBase(ImmutableRecord, Taggable):
             "priority".split())
 
     def __init__(self,
-                 id: Optional[str],
-                 happens_after: Union[
-                     Mapping[str, HappensAfter], FrozenSet[str], str, None],
-                 depends_on_is_final: Optional[bool],
-                 groups: Optional[FrozenSet[str]],
-                 conflicts_with_groups: Optional[FrozenSet[str]],
-                 no_sync_with: Optional[FrozenSet[Tuple[str, str]]],
-                 within_inames_is_final: Optional[bool],
-                 within_inames: Optional[FrozenSet[str]],
-                 priority: Optional[int],
-                 predicates: Optional[FrozenSet[str]],
-                 tags: Optional[FrozenSet[Tag]],
+                 id: str | None,
+                 happens_after: (
+                     Mapping[str, HappensAfter] | frozenset[str] | str | None),
+                 depends_on_is_final: bool | None,
+                 groups: frozenset[str] | None,
+                 conflicts_with_groups: frozenset[str] | None,
+                 no_sync_with: frozenset[tuple[str, str]] | None,
+                 within_inames_is_final: bool | None,
+                 within_inames: frozenset[str] | None,
+                 priority: int | None,
+                 predicates: frozenset[str] | None,
+                 tags: frozenset[Tag] | None,
                  *,
-                 depends_on: Union[FrozenSet[str], str, None] = None,
+                 depends_on: frozenset[str] | str | None = None,
                  ) -> None:
         from immutabledict import immutabledict
 
@@ -434,7 +446,7 @@ def read_dependency_names(self):
 
         return result
 
-    def reduction_inames(self) -> FrozenSet[str]:
+    def reduction_inames(self) -> frozenset[str]:
         raise NotImplementedError
 
     def sub_array_ref_inames(self):
@@ -572,7 +584,7 @@ def __setstate__(self, val):
         self.within_inames = (
                 intern_frozenset_of_ids(self.within_inames))
 
-    def _with_new_tags(self, tags: FrozenSet[Tag]):
+    def _with_new_tags(self, tags: frozenset[Tag]):
         return self.copy(tags=tags)
 
 # }}}
@@ -634,7 +646,7 @@ def _get_assignee_subscript_deps(expr):
 
 # {{{ atomic ops
 
-class MemoryOrdering:  # noqa
+class MemoryOrdering:
     """Ordering of atomic operations, defined as in C11 and OpenCL.
 
     .. attribute:: RELAXED
@@ -662,7 +674,7 @@ def to_string(v):
         raise ValueError("Unknown value of MemoryOrdering")
 
 
-class MemoryScope:  # noqa
+class MemoryScope:
     """Scope of atomicity, defined as in OpenCL.
 
     .. attribute:: auto
@@ -901,35 +913,35 @@ class Assignment(MultiAssignmentBase):
     .. automethod:: __init__
     """
 
-    assignee: ExpressionT
-    expression: ExpressionT
+    assignee: Expression
+    expression: Expression
     temp_var_type: LoopyOptional
-    atomicity: Tuple[VarAtomicity, ...]
+    atomicity: tuple[VarAtomicity, ...]
 
     fields = MultiAssignmentBase.fields | \
             set("assignee temp_var_type atomicity".split())
 
     def __init__(self,
-                 assignee: Union[str, ExpressionT],
-                 expression: Union[str, ExpressionT],
-                 id: Optional[str] = None,
-                 happens_after: Union[
-                     Mapping[str, HappensAfter], FrozenSet[str], str, None] = None,
-                 depends_on_is_final: Optional[bool] = None,
-                 groups: Optional[FrozenSet[str]] = None,
-                 conflicts_with_groups: Optional[FrozenSet[str]] = None,
-                 no_sync_with: Optional[FrozenSet[Tuple[str, str]]] = None,
-                 within_inames_is_final: Optional[bool] = None,
-                 within_inames: Optional[FrozenSet[str]] = None,
-                 priority: Optional[int] = None,
-                 predicates: Optional[FrozenSet[str]] = None,
-                 tags: Optional[FrozenSet[Tag]] = None,
-                 temp_var_type: Union[
-                     Type[_not_provided], None, LoopyOptional,
-                     LoopyType] = _not_provided,
-                 atomicity: Tuple[VarAtomicity, ...] = (),
+                 assignee: str | Expression,
+                 expression: str | Expression,
+                 id: str | None = None,
+                 happens_after:
+                     Mapping[str, HappensAfter] | frozenset[str] | str | None = None,
+                 depends_on_is_final: bool | None = None,
+                 groups: frozenset[str] | None = None,
+                 conflicts_with_groups: frozenset[str] | None = None,
+                 no_sync_with: frozenset[tuple[str, str]] | None = None,
+                 within_inames_is_final: bool | None = None,
+                 within_inames: frozenset[str] | None = None,
+                 priority: int | None = None,
+                 predicates: frozenset[str] | None = None,
+                 tags: frozenset[Tag] | None = None,
+                 temp_var_type:
+                    type[_not_provided] | LoopyOptional | LoopyType | None
+                    = _not_provided,
+                 atomicity: tuple[VarAtomicity, ...] = (),
                  *,
-                 depends_on: Union[FrozenSet[str], str, None] = None,
+                 depends_on: frozenset[str] | str | None = None,
                  ) -> None:
 
         if temp_var_type is _not_provided:
@@ -1271,8 +1283,8 @@ def modify_assignee_for_array_call(assignee):
                 "SubArrayRef as its inputs")
 
 
-def make_assignment(assignees: tuple[ExpressionT, ...],
-                    expression: ExpressionT,
+def make_assignment(assignees: tuple[Expression, ...],
+                    expression: Expression,
                     temp_var_types: (
                         Sequence[LoopyType | None] | None) = None,
                     **kwargs: Any) -> Assignment | CallInstruction:
@@ -1372,7 +1384,7 @@ class CInstruction(InstructionBase):
     .. attribute:: assignees
 
         A sequence (typically a :class:`tuple`) of variable references (with or
-        without subscript) as :class:`pymbolic.primitives.Expression` instances
+        without subscript) as :data:`pymbolic.typing.Expression` instances
         that :attr:`code` writes to. This is optional and only used for
         figuring out dependencies.
     """
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 9a14aedd5..c48da4be9 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1,4 +1,5 @@
 """Operations on the kernel object."""
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -27,14 +28,13 @@
 import sys
 from functools import reduce
 from sys import intern
-from typing import AbstractSet, Dict, FrozenSet, List, Mapping, Sequence, Set
+from typing import TYPE_CHECKING, AbstractSet, Mapping, Sequence
 
 import numpy as np
 
 import islpy as isl
 from islpy import dim_type
 from pytools import memoize_on_first_arg, natsorted
-from pytools.tag import Tag
 
 from loopy.diagnostic import LoopyError, warn_with_kernel
 from loopy.kernel import LoopKernel
@@ -46,7 +46,12 @@
 )
 from loopy.symbolic import CombineMapper
 from loopy.translation_unit import TranslationUnit, TUnitOrKernelT, for_each_kernel
-from loopy.types import ToLoopyTypeConvertible
+
+
+if TYPE_CHECKING:
+    from pytools.tag import Tag
+
+    from loopy.types import ToLoopyTypeConvertible
 
 
 logger = logging.getLogger(__name__)
@@ -100,7 +105,7 @@ def add_dtypes(
 
 
 def _add_dtypes_overdetermined(kernel, dtype_dict):
-    dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(kernel, dtype_dict)
+    _dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(kernel, dtype_dict)
     # do not throw error for unused args
     return kernel.copy(args=new_args, temporary_variables=new_temp_vars)
 
@@ -1823,7 +1828,7 @@ def get_subkernels(kernel) -> Sequence[str]:
 
 
 @memoize_on_first_arg
-def get_subkernel_to_insn_id_map(kernel: LoopKernel) -> Mapping[str, FrozenSet[str]]:
+def get_subkernel_to_insn_id_map(kernel: LoopKernel) -> Mapping[str, frozenset[str]]:
     """Return a :class:`dict` mapping subkernel names to a :class:`frozenset`
     consisting of the instruction ids scheduled within the subkernel. The
     kernel must be scheduled.
@@ -1837,7 +1842,7 @@ def get_subkernel_to_insn_id_map(kernel: LoopKernel) -> Mapping[str, FrozenSet[s
     from loopy.schedule import CallKernel, ReturnFromKernel, sched_item_to_insn_id
 
     subkernel = None
-    result: Dict[str, Set[str]] = {}
+    result: dict[str, set[str]] = {}
 
     for lin_item in kernel.linearization:
         if isinstance(lin_item, CallKernel):
@@ -1855,7 +1860,7 @@ def get_subkernel_to_insn_id_map(kernel: LoopKernel) -> Mapping[str, FrozenSet[s
 
 
 @memoize_on_first_arg
-def get_subkernel_extra_inames(kernel: LoopKernel) -> Mapping[str, FrozenSet[str]]:
+def get_subkernel_extra_inames(kernel: LoopKernel) -> Mapping[str, frozenset[str]]:
     from loopy.kernel import KernelState
     if kernel.state != KernelState.LINEARIZED:
         raise LoopyError("Kernel must be scheduled")
@@ -1863,7 +1868,7 @@ def get_subkernel_extra_inames(kernel: LoopKernel) -> Mapping[str, FrozenSet[str
     assert kernel.linearization is not None
 
     result = {}
-    inames: List[str] = []
+    inames: list[str] = []
 
     from loopy.schedule import CallKernel, EnterLoop, LeaveLoop
 
diff --git a/loopy/library/function.py b/loopy/library/function.py
index 7d274e492..8b61ad41a 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -20,14 +23,19 @@
 THE SOFTWARE.
 """
 
+from typing import TYPE_CHECKING
+
 import numpy as np
 
 from loopy.diagnostic import LoopyError
 from loopy.kernel.function_interface import ScalarCallable
-from loopy.translation_unit import CallablesTable
 from loopy.types import NumpyType
 
 
+if TYPE_CHECKING:
+    from loopy.translation_unit import CallablesTable
+
+
 class MakeTupleCallable(ScalarCallable):
     def with_types(self, arg_id_to_dtype, callables_table):
         new_arg_id_to_dtype = arg_id_to_dtype.copy()
diff --git a/loopy/library/random123.py b/loopy/library/random123.py
index 0afb0abb9..f65fa7600 100644
--- a/loopy/library/random123.py
+++ b/loopy/library/random123.py
@@ -1,4 +1,5 @@
 """Library integration with Random123."""
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
@@ -24,20 +25,35 @@
 """
 
 
+from dataclasses import dataclass, replace
+from typing import TYPE_CHECKING
+
 import numpy as np
 from mako.template import Template
 
-from pytools import ImmutableRecord
+from pymbolic.typing import not_none
 
 from loopy.kernel.function_interface import ScalarCallable
 
 
+if TYPE_CHECKING:
+    from loopy.target import TargetBase
+
+
 # {{{ rng metadata
 
-class RNGInfo(ImmutableRecord):
+@dataclass(frozen=True)
+class RNGInfo:
+    name: str
+    pyopencl_header: str
+    generic_header: str
+    key_width: int
+    width: int | None = None
+    bits: int | None = None
+
     @property
-    def full_name(self):
-        return "%s%dx%d" % (self.name, self.width, self.bits)
+    def full_name(self) -> str:
+        return "%s%dx%d" % (self.name, not_none(self.width), not_none(self.bits))
 
 
 _philox_base_info = RNGInfo(
@@ -53,15 +69,15 @@ def full_name(self):
             key_width=4)
 
 RNG_VARIANTS = [
-        _philox_base_info.copy(width=2, bits=32),
-        _philox_base_info.copy(width=2, bits=64),
-        _philox_base_info.copy(width=4, bits=32),
-        _philox_base_info.copy(width=4, bits=64),
-
-        _threefry_base_info.copy(width=2, bits=32),
-        _threefry_base_info.copy(width=2, bits=64),
-        _threefry_base_info.copy(width=4, bits=32),
-        _threefry_base_info.copy(width=4, bits=64),
+        replace(_philox_base_info, width=2, bits=32),
+        replace(_philox_base_info, width=2, bits=64),
+        replace(_philox_base_info, width=4, bits=32),
+        replace(_philox_base_info, width=4, bits=64),
+
+        replace(_threefry_base_info, width=2, bits=32),
+        replace(_threefry_base_info, width=2, bits=64),
+        replace(_threefry_base_info, width=4, bits=32),
+        replace(_threefry_base_info, width=4, bits=64),
         ]
 
 FUNC_NAMES_TO_RNG = {
@@ -165,12 +181,12 @@ def full_name(self):
 # }}}
 
 
+@dataclass(frozen=True, init=False)
 class Random123Callable(ScalarCallable):
     """
     Records information about for the random123 functions.
     """
-    fields = ScalarCallable.fields | {"target"}
-    hash_fields = ScalarCallable.hash_fields + ("target",)
+    target: TargetBase
 
     def __init__(self, name, arg_id_to_dtype=None,
                  arg_id_to_descr=None, name_in_target=None, target=None):
@@ -179,7 +195,7 @@ def __init__(self, name, arg_id_to_dtype=None,
                          arg_id_to_descr=arg_id_to_descr,
                          name_in_target=name_in_target)
 
-        self.target = target
+        object.__setattr__(self, "target", target)
 
     def with_types(self, arg_id_to_dtype, callables_table):
 
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 2d357d3b4..6ddc3fb86 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -21,11 +24,12 @@
 """
 
 
+from typing import TYPE_CHECKING
+
 import numpy as np
 
 from pymbolic import var
 from pymbolic.primitives import expr_dataclass
-from pytools.persistent_dict import Hash, KeyBuilder
 
 from loopy.diagnostic import LoopyError
 from loopy.kernel.function_interface import ScalarCallable
@@ -34,6 +38,10 @@
 from loopy.types import NumpyType
 
 
+if TYPE_CHECKING:
+    from pytools.persistent_dict import Hash, KeyBuilder
+
+
 __doc__ = """
 .. currentmodule:: loopy.library.reduction
 
@@ -329,7 +337,7 @@ def neutral_element(self, scalar_dtype, segment_flag_dtype,
         from loopy.library.function import MakeTupleCallable
         from loopy.translation_unit import add_callable_to_table
 
-        scalar_neutral_element, calables_table = (
+        scalar_neutral_element, _calables_table = (
                 self.inner_reduction.neutral_element(
                     scalar_dtype, callables_table, target))
 
@@ -347,8 +355,7 @@ def neutral_element(self, scalar_dtype, segment_flag_dtype,
                 segment_flag_dtype.numpy_dtype.type(0)), callables_table
 
     def result_dtypes(self, scalar_dtype, segment_flag_dtype):
-        return (self.inner_reduction.result_dtypes(scalar_dtype)
-                + (segment_flag_dtype,))
+        return ((*self.inner_reduction.result_dtypes(scalar_dtype), segment_flag_dtype))
 
     def __str__(self):
         return "segmented(%s)" % self.which
@@ -538,7 +545,7 @@ def register_reduction_parser(parser):
     _REDUCTION_OP_PARSERS.append(parser)
 
 
-def parse_reduction_op(name):
+def parse_reduction_op(name: str) -> ReductionOperation | None:
     import re
 
     red_op_match = re.match(r"^([a-z]+)_([a-z0-9_]+)$", name)
@@ -571,12 +578,12 @@ class ReductionCallable(ScalarCallable):
     def with_types(self, arg_id_to_dtype, callables_table):
         scalar_dtype = arg_id_to_dtype[0]
         index_dtype = arg_id_to_dtype[1]
-        result_dtypes = self.name.reduction_op.result_dtypes(scalar_dtype,
+        result_dtypes = self.name.reduction_op.result_dtypes(scalar_dtype,  # pylint: disable=no-member
                 index_dtype)
         new_arg_id_to_dtype = arg_id_to_dtype.copy()
         new_arg_id_to_dtype[-1] = result_dtypes[0]
         new_arg_id_to_dtype[-2] = result_dtypes[1]
-        name_in_target = self.name.reduction_op.prefix(scalar_dtype,
+        name_in_target = self.name.reduction_op.prefix(scalar_dtype,  # pylint: disable=no-member
                 index_dtype) + "_op"
 
         return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
@@ -594,7 +601,7 @@ def with_descrs(self, arg_id_to_descr, callables_table):
 class ArgExtOpCallable(ReductionCallable):
 
     def generate_preambles(self, target):
-        op = self.name.reduction_op
+        op = self.name.reduction_op  # pylint: disable=no-member
         scalar_dtype = self.arg_id_to_dtype[-1]
         index_dtype = self.arg_id_to_dtype[-2]
 
@@ -630,7 +637,7 @@ def generate_preambles(self, target):
 class SegmentOpCallable(ReductionCallable):
 
     def generate_preambles(self, target):
-        op = self.name.reduction_op
+        op = self.name.reduction_op  # pylint: disable=no-member
         scalar_dtype = self.arg_id_to_dtype[-1]
         segment_flag_dtype = self.arg_id_to_dtype[-2]
         prefix = op.prefix(scalar_dtype, segment_flag_dtype)
diff --git a/loopy/loop.py b/loopy/loop.py
index 001cd80a8..9903474c9 100644
--- a/loopy/loop.py
+++ b/loopy/loop.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
diff --git a/loopy/match.py b/loopy/match.py
index ae52e6c65..ef18799df 100644
--- a/loopy/match.py
+++ b/loopy/match.py
@@ -1,5 +1,33 @@
-"""Matching functionality for instruction ids and substitution
-rule invocations stacks."""
+"""
+.. autoclass:: Matchable
+.. autoclass:: StackMatchComponent
+.. autoclass:: StackMatch
+
+.. autofunction:: parse_match
+
+.. autofunction:: parse_stack_match
+
+.. autodata:: ToStackMatchConvertible
+
+Match expressions
+^^^^^^^^^^^^^^^^^
+
+.. autoclass:: MatchExpressionBase
+.. autoclass:: All
+.. autoclass:: And
+.. autoclass:: Or
+.. autoclass:: Not
+.. autoclass:: Id
+.. autoclass:: ObjTagged
+.. autoclass:: Tagged
+.. autoclass:: Writes
+.. autoclass:: Reads
+.. autoclass:: InKernel
+.. autoclass:: Iname
+
+"""
+
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -28,43 +56,22 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from sys import intern
-from typing import FrozenSet, List, Protocol, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Protocol, Sequence, Union
+
+from typing_extensions import TypeAlias
 
-from loopy.kernel import LoopKernel
 from loopy.kernel.instruction import InstructionBase
 
 
 NoneType = type(None)
 
-import pytools.tag
 from pytools.lex import RE
 
 
-__doc__ = """
-.. autoclass:: Matchable
-.. autoclass:: StackMatchComponent
-.. autoclass:: StackMatch
-
-.. autofunction:: parse_match
-
-.. autofunction:: parse_stack_match
-
-Match expressions
-^^^^^^^^^^^^^^^^^
+if TYPE_CHECKING:
+    import pytools.tag
 
-.. autoclass:: MatchExpressionBase
-.. autoclass:: All
-.. autoclass:: And
-.. autoclass:: Or
-.. autoclass:: Not
-.. autoclass:: Id
-.. autoclass:: ObjTagged
-.. autoclass:: Tagged
-.. autoclass:: Writes
-.. autoclass:: Reads
-.. autoclass:: InKernel
-.. autoclass:: Iname
-"""
+    from loopy.kernel import LoopKernel
 
 
 def re_from_glob(s: str) -> re.Pattern:
@@ -133,7 +140,7 @@ class Matchable(Protocol):
     .. attribute:: tags
     """
     @property
-    def tags(self) -> FrozenSet[pytools.tag.Tag]:
+    def tags(self) -> frozenset[pytools.tag.Tag]:
         ...
 
 
@@ -494,7 +501,7 @@ def __call__(self, kernel: LoopKernel, stack: Sequence[Matchable]) -> bool:
 @dataclass(eq=True, frozen=True)
 class RuleInvocationMatchable:
     id: str
-    tags: FrozenSet[pytools.tag.Tag]
+    tags: frozenset[pytools.tag.Tag]
 
     def write_dependency_names(self):
         raise TypeError("writes: query may not be applied to rule invocations")
@@ -516,11 +523,11 @@ class StackMatch:
 
     def __call__(
             self, kernel: LoopKernel, insn: InstructionBase,
-            rule_stack: Sequence[Tuple[str, FrozenSet[pytools.tag.Tag]]]) -> bool:
+            rule_stack: Sequence[tuple[str, frozenset[pytools.tag.Tag]]]) -> bool:
         """
         :arg rule_stack: a tuple of (name, tags) rule invocation, outermost first
         """
-        stack_of_matchables: List[Matchable] = [insn]
+        stack_of_matchables: list[Matchable] = [insn]
         for id, tags in rule_stack:
             stack_of_matchables.append(RuleInvocationMatchable(id, tags))
 
@@ -531,10 +538,10 @@ def __call__(
 
 # {{{ stack match parsing
 
-ToStackMatchCovertible = Union[StackMatch, str, None]
+ToStackMatchConvertible: TypeAlias = Union[StackMatch, str, None]
 
 
-def parse_stack_match(smatch: ToStackMatchCovertible) -> StackMatch:
+def parse_stack_match(smatch: ToStackMatchConvertible) -> StackMatch:
     """Syntax example::
 
         ... > outer > ... > next > innermost $
diff --git a/loopy/options.py b/loopy/options.py
index 293670774..d9547c846 100644
--- a/loopy/options.py
+++ b/loopy/options.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
 
 __license__ = """
@@ -23,12 +26,16 @@
 
 import os
 import re
-from typing import Any
+from typing import TYPE_CHECKING, Any, ClassVar
 from warnings import warn
 
 from pytools import ImmutableRecord
 
 
+if TYPE_CHECKING:
+    from collections.abc import Mapping
+
+
 ALLOW_TERMINAL_COLORS = True
 
 
@@ -198,7 +205,7 @@ class Options(ImmutableRecord):
         RAW, WAR and WAW races.
     """
 
-    _legacy_options_map = {
+    _legacy_options_map: ClassVar[Mapping[str, tuple[str, None] | None]] = {
             "cl_build_options": ("build_options", None),
             "write_cl": ("write_code", None),
             "highlight_cl": None,
@@ -220,7 +227,7 @@ def __init__(
         kwargs = _apply_legacy_map(self._legacy_options_map, kwargs)
 
         try:
-            import colorama  # noqa
+            import colorama  # noqa: F401
         except ImportError:
             allow_terminal_colors_def = False
         else:
@@ -332,7 +339,7 @@ def _style(self):
             return _ColoramaStub()
 
 
-KEY_VAL_RE = re.compile("^([a-zA-Z0-9]+)=(.*)$")
+KEY_VAL_RE = re.compile(r"^([a-zA-Z0-9]+)=(.*)$")
 
 
 def make_options(options_arg):
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 3293e9a1e..aee4044be 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -21,7 +24,7 @@
 """
 
 import logging
-from typing import FrozenSet, Iterable, List, Optional, Tuple, TypeVar, cast
+from typing import TYPE_CHECKING, Iterable, List, TypeVar, cast
 
 
 logger = logging.getLogger(__name__)
@@ -39,8 +42,6 @@
     WriteRaceConditionWarning,
     warn_with_kernel,
 )
-from loopy.kernel import LoopKernel
-from loopy.kernel.array import ArrayDimImplementationTag
 from loopy.kernel.data import (
     ArrayArg,
     KernelArgument,
@@ -68,7 +69,12 @@
 
 # for the benefit of loopy.statistics, for now
 from loopy.type_inference import infer_unknown_types
-from loopy.typing import ExpressionT
+
+
+if TYPE_CHECKING:
+    from loopy.kernel import LoopKernel
+    from loopy.kernel.array import ArrayDimImplementationTag
+    from loopy.typing import Expression
 
 
 # {{{ check for writes to predicates
@@ -135,8 +141,8 @@ def map_reduction(expr, rec):
 
 
 def _remove_at_indices(
-        indices: FrozenSet[int], values: Optional[Iterable[T]]
-        ) -> Optional[Tuple[T, ...]]:
+        indices: frozenset[int], values: Iterable[T] | None
+        ) -> tuple[T, ...] | None:
     """
     Assumes *indices* is sorted.
     """
@@ -174,14 +180,14 @@ def make_arrays_for_sep_arrays(kernel: LoopKernel) -> LoopKernel:
         sep_axis_indices_set = frozenset(sep_axis_indices)
 
         assert isinstance(arg.shape, tuple)
-        new_shape: Optional[Tuple[ExpressionT, ...]] = \
+        new_shape: tuple[Expression, ...] | None = \
                 _remove_at_indices(sep_axis_indices_set, arg.shape)
-        new_dim_tags: Optional[Tuple[ArrayDimImplementationTag, ...]] = \
+        new_dim_tags: tuple[ArrayDimImplementationTag, ...] | None = \
                 _remove_at_indices(sep_axis_indices_set, arg.dim_tags)
-        new_dim_names: Optional[Tuple[Optional[str], ...]] = \
+        new_dim_names: tuple[str | None, ...] | None = \
                 _remove_at_indices(sep_axis_indices_set, arg.dim_names)
 
-        sep_shape: List[ExpressionT] = [arg.shape[i] for i in sep_axis_indices]
+        sep_shape: list[Expression] = [arg.shape[i] for i in sep_axis_indices]
         for i, sep_shape_i in enumerate(sep_shape):
             if not isinstance(sep_shape_i, (int, np.integer)):
                 raise LoopyError(
@@ -193,7 +199,7 @@ def make_arrays_for_sep_arrays(kernel: LoopKernel) -> LoopKernel:
                 sep_axis_indices_set=sep_axis_indices_set,
                 subarray_names=Map({
                     ind: vng(f"{arg.name}_s{'_'.join(str(i) for i in ind)}")
-                    for ind in np.ndindex(*cast(List[int], sep_shape))}))
+                    for ind in np.ndindex(*cast("List[int]", sep_shape))}))
 
         new_args.append(arg.copy(_separation_info=sep_info))
 
@@ -220,11 +226,11 @@ def make_arrays_for_sep_arrays(kernel: LoopKernel) -> LoopKernel:
 # {{{ make temporary variables for offsets and strides
 
 def make_args_for_offsets_and_strides(kernel: LoopKernel) -> LoopKernel:
-    additional_args: List[KernelArgument] = []
+    additional_args: list[KernelArgument] = []
 
     vng = kernel.get_var_name_generator()
 
-    from pymbolic.primitives import Expression, Variable
+    from pymbolic.primitives import ExpressionNode, Variable
 
     from loopy.kernel.array import FixedStrideArrayDimTag
 
@@ -241,13 +247,13 @@ def make_args_for_offsets_and_strides(kernel: LoopKernel) -> LoopKernel:
                 additional_args.append(ValueArg(
                         offset_name, kernel.index_dtype))
                 arg = arg.copy(offset=offset_name)
-            elif isinstance(arg.offset, (int, np.integer, Expression, str)):
+            elif isinstance(arg.offset, (int, np.integer, ExpressionNode, str)):
                 pass
             else:
                 raise LoopyError(f"invalid value of {what}")
 
             if arg.dim_tags is None:
-                new_dim_tags: Optional[Tuple[ArrayDimImplementationTag, ...]]  \
+                new_dim_tags: tuple[ArrayDimImplementationTag, ...] | None \
                         = arg.dim_tags
             else:
                 new_dim_tags = ()
@@ -261,12 +267,12 @@ def make_args_for_offsets_and_strides(kernel: LoopKernel) -> LoopKernel:
                             additional_args.append(ValueArg(
                                     stride_name, kernel.index_dtype))
                         elif isinstance(
-                                dim_tag.stride, (int, np.integer, Expression)):
+                                dim_tag.stride, (int, np.integer, ExpressionNode)):
                             pass
                         else:
                             raise LoopyError(f"invalid value of {what}")
 
-                    new_dim_tags = new_dim_tags + (dim_tag,)
+                    new_dim_tags = (*new_dim_tags, dim_tag)
 
             arg = arg.copy(dim_tags=new_dim_tags)
 
@@ -286,7 +292,7 @@ def make_args_for_offsets_and_strides(kernel: LoopKernel) -> LoopKernel:
 
 def zero_offsets_and_strides(kernel: LoopKernel) -> LoopKernel:
     made_changes = False
-    from pymbolic.primitives import Expression
+    from pymbolic.primitives import ExpressionNode
 
     # {{{ process arguments
 
@@ -298,7 +304,7 @@ def zero_offsets_and_strides(kernel: LoopKernel) -> LoopKernel:
             if arg.offset is auto:
                 made_changes = True
                 arg = arg.copy(offset=0)
-            elif isinstance(arg.offset, (int, np.integer, Expression, str)):
+            elif isinstance(arg.offset, (int, np.integer, ExpressionNode, str)):
                 from pymbolic.primitives import is_zero
                 if not is_zero(arg.offset):
                     raise LoopyError(
@@ -499,7 +505,7 @@ def check_atomic_loads(kernel):
                 for x in missed:
                     if {x} & atomicity_candidates:
                         insn = insn.copy(
-                            atomicity=insn.atomicity + (AtomicLoad(x),))
+                            atomicity=(*insn.atomicity, AtomicLoad(x)))
 
         new_insns.append(insn)
 
@@ -697,7 +703,7 @@ def _tuple_or_none(s):
                 raise NotImplementedError()
         new_callable, clbl_inf_ctx = t_unit.callables_table[e].with_descrs(
                 arg_id_to_descr, clbl_inf_ctx)
-        clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable,
+        clbl_inf_ctx, _new_name = clbl_inf_ctx.with_callable(e, new_callable,
                                                             is_entrypoint=True)
 
     return clbl_inf_ctx.finish_program(t_unit)
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 2460f5ed7..55e0a197b 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -27,7 +27,6 @@
 
 import logging
 import sys
-from collections.abc import Hashable, Iterator, Mapping, Sequence, Set
 from dataclasses import dataclass, replace
 from typing import (
     TYPE_CHECKING,
@@ -42,19 +41,22 @@
 from pytools.persistent_dict import WriteOncePersistentDict
 
 from loopy.diagnostic import LoopyError, ScheduleDebugInputError, warn_with_kernel
-from loopy.kernel.instruction import InstructionBase
 from loopy.tools import LoopyKeyBuilder, caches
 from loopy.typing import InameStr
 from loopy.version import DATA_MODEL_VERSION
 
 
 if TYPE_CHECKING:
+    from collections.abc import Hashable, Iterator, Mapping, Sequence, Set
+
     from loopy.kernel import LoopKernel
+    from loopy.kernel.function_interface import InKernelCallable
+    from loopy.kernel.instruction import InstructionBase
     from loopy.schedule.tools import (
         InameStrSet,
         LoopTree,
     )
-    from loopy.translation_unit import CallablesTable, TranslationUnit
+    from loopy.translation_unit import CallablesTable, FunctionIdT, TranslationUnit
 
 
 logger = logging.getLogger(__name__)
@@ -1020,7 +1022,7 @@ def _generate_loop_schedules_v2(kernel: LoopKernel) -> Sequence[ScheduleItem]:
     def iname_key(iname: str) -> str:
         all_ancestors = sorted(loop_tree.ancestors(iname),
                                key=lambda x: loop_tree.depth(x))
-        return ",".join(all_ancestors+[iname])
+        return ",".join([*all_ancestors, iname])
 
     def key(x: ScheduleItem) -> tuple[str, ...]:
         if isinstance(x, RunInstruction):
@@ -1097,7 +1099,7 @@ def _generate_loop_schedules_internal(
         assert sched_state.within_subkernel is False
         yield from _generate_loop_schedules_internal(
                 sched_state.copy(
-                    schedule=sched_state.schedule + (next_preschedule_item,),
+                    schedule=(*sched_state.schedule, next_preschedule_item),
                     preschedule=sched_state.preschedule[1:],
                     within_subkernel=True,
                     may_schedule_global_barriers=False,
@@ -1110,7 +1112,7 @@ def _generate_loop_schedules_internal(
         if sched_state.active_inames == sched_state.enclosing_subkernel_inames:
             yield from _generate_loop_schedules_internal(
                     sched_state.copy(
-                        schedule=sched_state.schedule + (next_preschedule_item,),
+                        schedule=(*sched_state.schedule, next_preschedule_item),
                         preschedule=sched_state.preschedule[1:],
                         within_subkernel=False,
                         may_schedule_global_barriers=True),
@@ -1129,7 +1131,7 @@ def _generate_loop_schedules_internal(
             and next_preschedule_item.originating_insn_id is None):
         yield from _generate_loop_schedules_internal(
                     sched_state.copy(
-                        schedule=sched_state.schedule + (next_preschedule_item,),
+                        schedule=(*sched_state.schedule, next_preschedule_item),
                         preschedule=sched_state.preschedule[1:]),
                     debug=debug)
 
@@ -1289,7 +1291,7 @@ def insn_sort_key(insn_id):
                     unscheduled_insn_ids=sched_state.unscheduled_insn_ids - iid_set,
                     insn_ids_to_try=new_insn_ids_to_try,
                     schedule=(
-                        sched_state.schedule + (RunInstruction(insn_id=insn.id),)),
+                        (*sched_state.schedule, RunInstruction(insn_id=insn.id))),
                     preschedule=(
                         sched_state.preschedule
                         if insn_id not in sched_state.prescheduled_insn_ids
@@ -1403,8 +1405,8 @@ def insn_sort_key(insn_id):
                 for sub_sched in _generate_loop_schedules_internal(
                         sched_state.copy(
                             schedule=(
-                                sched_state.schedule
-                                + (LeaveLoop(iname=last_entered_loop),)),
+                                (*sched_state.schedule,
+                                    LeaveLoop(iname=last_entered_loop))),
                             active_inames=sched_state.active_inames[:-1],
                             insn_ids_to_try=insn_ids_to_try,
                             preschedule=(
@@ -1613,10 +1615,9 @@ def insn_sort_key(insn_id):
                     for sub_sched in _generate_loop_schedules_internal(
                             sched_state.copy(
                                 schedule=(
-                                    sched_state.schedule
-                                    + (EnterLoop(iname=iname),)),
+                                    (*sched_state.schedule, EnterLoop(iname=iname))),
                                 active_inames=(
-                                    sched_state.active_inames + (iname,)),
+                                    (*sched_state.active_inames, iname)),
                                 entered_inames=(
                                     sched_state.entered_inames
                                     | frozenset((iname,))),
@@ -2446,7 +2447,7 @@ def get_one_linearized_kernel(
                         callables_table)
 
     if CACHING_ENABLED and not from_cache:
-        schedule_cache.store_if_not_present(sched_cache_key, result)  # pylint: disable=possibly-used-before-assignment  # noqa: E501
+        schedule_cache.store_if_not_present(sched_cache_key, result)  # pylint: disable=possibly-used-before-assignment
 
     return result
 
@@ -2466,7 +2467,7 @@ def linearize(t_unit: TranslationUnit) -> TranslationUnit:
 
     pre_schedule_checks(t_unit)
 
-    new_callables = {}
+    new_callables: dict[FunctionIdT, InKernelCallable] = {}
 
     for name, clbl in t_unit.callables_table.items():
         if isinstance(clbl, CallableKernel):
diff --git a/loopy/schedule/device_mapping.py b/loopy/schedule/device_mapping.py
index a0345049d..226757dea 100644
--- a/loopy/schedule/device_mapping.py
+++ b/loopy/schedule/device_mapping.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2016 Matt Wala"
 
 __license__ = """
@@ -45,9 +48,9 @@ def map_schedule_onto_host_or_device(kernel):
 
     if not kernel.target.split_kernel_at_global_barriers():
         new_schedule = (
-            [CallKernel(kernel_name=device_prog_name_gen())] +
-            list(kernel.linearization) +
-            [ReturnFromKernel(kernel_name=kernel.name)])
+            [CallKernel(kernel_name=device_prog_name_gen()),
+                *kernel.linearization,
+                ReturnFromKernel(kernel_name=kernel.name)])
         kernel = kernel.copy(linearization=new_schedule)
     else:
         kernel = map_schedule_onto_host_or_device_impl(
@@ -92,19 +95,13 @@ def inner_mapper(start_idx, end_idx, new_schedule):
                     schedule_required_splitting = True
                     if current_chunk:
                         new_schedule.extend(
-                            [dummy_call.copy()] +
-                            current_chunk +
-                            [dummy_return.copy()])
+                            [dummy_call.copy(), *current_chunk, dummy_return.copy()])
                     new_schedule.extend(
-                        [start_item] +
-                        inner_schedule +
-                        [end_item])
+                        [start_item, *inner_schedule, end_item])
                     current_chunk = []
                 else:
                     current_chunk.extend(
-                        [start_item] +
-                        inner_schedule +
-                        [end_item])
+                        [start_item, *inner_schedule, end_item])
 
             elif isinstance(sched_item, Barrier):
                 if sched_item.synchronization_kind == "global":
@@ -112,9 +109,7 @@ def inner_mapper(start_idx, end_idx, new_schedule):
                     schedule_required_splitting = True
                     if current_chunk:
                         new_schedule.extend(
-                            [dummy_call.copy()] +
-                            current_chunk +
-                            [dummy_return.copy()])
+                            [dummy_call.copy(), *current_chunk, dummy_return.copy()])
                     new_schedule.append(sched_item)
                     current_chunk = []
                 else:
@@ -127,9 +122,7 @@ def inner_mapper(start_idx, end_idx, new_schedule):
         if current_chunk and schedule_required_splitting:
             # Wrap remainder of schedule into a kernel call.
             new_schedule.extend(
-                [dummy_call.copy()] +
-                current_chunk +
-                [dummy_return.copy()])
+                [dummy_call.copy(), *current_chunk, dummy_return.copy()])
         else:
             new_schedule.extend(current_chunk)
 
@@ -142,9 +135,7 @@ def inner_mapper(start_idx, end_idx, new_schedule):
     if not split_kernel:
         # Wrap everything into a kernel call.
         new_schedule = (
-            [dummy_call.copy()] +
-            new_schedule +
-            [dummy_return.copy()])
+            [dummy_call.copy(), *new_schedule, dummy_return.copy()])
 
     # Assign names to CallKernel / ReturnFromKernel instructions
 
diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py
index 3858462b1..b659ee7b7 100644
--- a/loopy/schedule/tools.py
+++ b/loopy/schedule/tools.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = """
 Copyright (C) 2016 Matt Wala
 Copyright (C) 2020 University of Illinois Board of Trustees
@@ -22,13 +25,19 @@
 .. autoclass:: AccessMapDescriptor
 .. autoclass:: WriteRaceChecker
 
-.. autoclass:: InameStrSet
 .. autoclass:: LoopNestTree
 .. autoclass:: LoopTree
 
 .. autofunction:: separate_loop_nest
 .. autofunction:: get_partial_loop_nest_tree
 .. autofunction:: get_loop_tree
+
+References
+^^^^^^^^^^
+
+.. class:: InameStrSet
+
+    See :class:`loopy.typing.InameStrSet`
 """
 
 __license__ = """
@@ -52,10 +61,9 @@
 """
 
 import enum
-from collections.abc import Callable, Collection, Mapping
 from dataclasses import dataclass
 from functools import cached_property, reduce
-from typing import AbstractSet, Dict, FrozenSet, List, Sequence, Set, Tuple
+from typing import TYPE_CHECKING, AbstractSet, Sequence
 
 from immutables import Map
 from typing_extensions import TypeAlias
@@ -64,11 +72,16 @@
 from pytools import memoize_method, memoize_on_first_arg
 
 from loopy.diagnostic import LoopyError
-from loopy.kernel import LoopKernel
 from loopy.kernel.data import AddressSpace, ArrayArg, TemporaryVariable
-from loopy.schedule import ScheduleItem
 from loopy.schedule.tree import Tree
-from loopy.typing import InameStr, not_none
+from loopy.typing import InameStr, InameStrSet, not_none
+
+
+if TYPE_CHECKING:
+    from collections.abc import Callable, Collection, Mapping
+
+    from loopy.kernel import LoopKernel
+    from loopy.schedule import ScheduleItem
 
 
 # {{{ block boundary finder
@@ -97,7 +110,7 @@ def get_block_boundaries(schedule: Sequence[ScheduleItem]) -> Mapping[int, int]:
 # {{{ subkernel tools
 
 def temporaries_read_in_subkernel(
-        kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]:
+        kernel: LoopKernel, subkernel_name: str) -> frozenset[str]:
     from loopy.kernel.tools import get_subkernel_to_insn_id_map
     insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name]
     inames = frozenset().union(*(kernel.insn_inames(insn_id)
@@ -115,7 +128,7 @@ def temporaries_read_in_subkernel(
 
 
 def temporaries_written_in_subkernel(
-        kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]:
+        kernel: LoopKernel, subkernel_name: str) -> frozenset[str]:
     from loopy.kernel.tools import get_subkernel_to_insn_id_map
     insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name]
     return frozenset(tv
@@ -125,7 +138,7 @@ def temporaries_written_in_subkernel(
 
 
 def args_read_in_subkernel(
-        kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]:
+        kernel: LoopKernel, subkernel_name: str) -> frozenset[str]:
     from loopy.kernel.tools import get_subkernel_to_insn_id_map
     insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name]
     inames = frozenset().union(*(kernel.insn_inames(insn_id)
@@ -142,7 +155,7 @@ def args_read_in_subkernel(
 
 
 def args_written_in_subkernel(
-        kernel: LoopKernel, subkernel_name: str) -> FrozenSet[str]:
+        kernel: LoopKernel, subkernel_name: str) -> frozenset[str]:
     from loopy.kernel.tools import get_subkernel_to_insn_id_map
     insn_ids = get_subkernel_to_insn_id_map(kernel)[subkernel_name]
     return frozenset(arg
@@ -152,8 +165,8 @@ def args_written_in_subkernel(
 
 
 def supporting_temporary_names(
-        kernel: LoopKernel, tv_names: FrozenSet[str]) -> FrozenSet[str]:
-    result: Set[str] = set()
+        kernel: LoopKernel, tv_names: frozenset[str]) -> frozenset[str]:
+    result: set[str] = set()
 
     for name in tv_names:
         tv = kernel.temporary_variables[name]
@@ -176,7 +189,7 @@ class KernelArgInfo:
     """
 
     passed_arg_names: Sequence[str]
-    written_names: FrozenSet[str]
+    written_names: frozenset[str]
 
     @property
     def passed_names(self) -> Sequence[str]:
@@ -208,7 +221,7 @@ def _should_temp_var_be_passed(tv: TemporaryVariable) -> bool:
 class _SupportingNameTracker:
     def __init__(self, kernel: LoopKernel):
         self.kernel = kernel
-        self.name_to_main_name: Dict[str, str] = {}
+        self.name_to_main_name: dict[str, str] = {}
 
     def add_supporting_names_for(self, name):
         var_descr = self.kernel.get_var_descriptor(name)
@@ -218,8 +231,8 @@ def add_supporting_names_for(self, name):
                     | {name})
 
     def get_additional_args_and_tvs(
-            self, already_passed: Set[str]
-            ) -> Tuple[List[str], List[str]]:
+            self, already_passed: set[str]
+            ) -> tuple[list[str], list[str]]:
         additional_args = []
         additional_temporaries = []
 
@@ -237,11 +250,11 @@ def get_additional_args_and_tvs(
 
 
 def _process_args_for_arg_info(
-        kernel: LoopKernel, args_read: Set[str], args_written: Set[str],
+        kernel: LoopKernel, args_read: set[str], args_written: set[str],
         supp_name_tracker: _SupportingNameTracker, used_only: bool,
-        ) -> List[str]:
+        ) -> list[str]:
 
-    args_expected: Set[str] = set()
+    args_expected: set[str] = set()
 
     passed_arg_names = []
     for arg in kernel.args:
@@ -319,7 +332,7 @@ def get_subkernel_arg_info(
             supp_name_tracker=supp_name_tracker,
             used_only=True)
 
-    passed_temporaries: List[str] = []
+    passed_temporaries: list[str] = []
     for tv_name in sorted(tvs_read | tvs_written):
         supp_name_tracker.add_supporting_names_for(tv_name)
         tv = kernel.temporary_variables[tv_name]
@@ -671,7 +684,6 @@ def do_accesses_result_in_races(self, insn1, insn1_dir, insn2, insn2_dir,
 # }}}
 
 
-InameStrSet: TypeAlias = FrozenSet[InameStr]
 LoopNestTree: TypeAlias = Tree[InameStrSet]
 LoopTree: TypeAlias = Tree[InameStr]
 
@@ -783,8 +795,8 @@ def _add_inner_loops(tree, outer_loop_nest, inner_loop_nest):
 
 def _order_loop_nests(
             loop_nest_tree: LoopNestTree,
-            strict_priorities: FrozenSet[Tuple[InameStr, ...]],
-            relaxed_priorities: FrozenSet[Tuple[InameStr, ...]],
+            strict_priorities: frozenset[tuple[InameStr, ...]],
+            relaxed_priorities: frozenset[tuple[InameStr, ...]],
             iname_to_tree_node_id: Mapping[InameStr, InameStrSet],
           ) -> LoopTree:
     """
@@ -825,7 +837,7 @@ def _order_loop_nests(
     # toposort for each loop nest.
 
     def _update_nesting_constraints(
-                priorities: FrozenSet[Tuple[InameStr, ...]],
+                priorities: frozenset[tuple[InameStr, ...]],
                 cannot_satisfy_callback: Callable[[str], None]
             ) -> None:
         """
@@ -862,10 +874,12 @@ def _update_nesting_constraints(
                                                 .ancestors(inner_iname_nest))
                     ancestors_of_outer_iname = (loop_nest_tree
                                                 .ancestors(outer_iname_nest))
-                    if outer_iname in ancestors_of_inner_iname:
+                    if any(outer_iname in ancestor
+                           for ancestor in ancestors_of_inner_iname):
                         # nesting constraint already satisfied => do nothing
                         pass
-                    elif inner_iname in ancestors_of_outer_iname:
+                    elif any(inner_iname in ancestor
+                             for ancestor in ancestors_of_outer_iname):
                         cannot_satisfy_callback("Cannot satisfy constraint that"
                                                 f" iname '{inner_iname}' must be"
                                                 f" nested within '{outer_iname}''.")
@@ -968,7 +982,7 @@ def get_partial_loop_nest_tree(kernel: LoopKernel) -> LoopNestTree:
     tree = Tree.from_root(root)
 
     # mapping from iname to the innermost loop nest they are part of in *tree*.
-    iname_to_tree_node_id: Dict[InameStr, InameStrSet] = {}
+    iname_to_tree_node_id: dict[InameStr, InameStrSet] = {}
 
     # if there were any loop with no inames, those have been already account
     # for as the root.
@@ -1067,7 +1081,7 @@ def get_loop_tree(kernel: LoopKernel) -> LoopTree:
     iname_to_tree_node_id = (
         _get_iname_to_tree_node_id_from_partial_loop_nest_tree(tree))
 
-    strict_loop_priorities: FrozenSet[Tuple[InameStr, ...]] = frozenset()
+    strict_loop_priorities: frozenset[tuple[InameStr, ...]] = frozenset()
 
     # {{{ impose constraints by the domain tree
 
diff --git a/loopy/schedule/tree.py b/loopy/schedule/tree.py
index e98724f83..3861aa75c 100644
--- a/loopy/schedule/tree.py
+++ b/loopy/schedule/tree.py
@@ -34,9 +34,10 @@
 THE SOFTWARE.
 """
 
+import operator
 from collections.abc import Hashable, Iterator, Sequence
 from dataclasses import dataclass
-from functools import cached_property
+from functools import cached_property, reduce
 from typing import Generic, TypeVar
 
 from immutables import Map
@@ -49,7 +50,9 @@
 NodeT = TypeVar("NodeT", bound=Hashable)
 
 
-@dataclass(frozen=True)
+# Frozen only when __debug__ is true (i.e. not under -O): freezing is slower.
+# Tree objects are immutable, and offer no way to mutate the tree.
+@dataclass(frozen=__debug__)  # type: ignore[literal-required]
 class Tree(Generic[NodeT]):
     """
     An immutable tree containing nodes of type :class:`NodeT`.
@@ -94,31 +97,23 @@ def ancestors(self, node: NodeT) -> tuple[NodeT, ...]:
         """
         Returns a :class:`tuple` of nodes that are ancestors of *node*.
         """
-        assert node in self
-
-        if self.is_root(node):
+        parent = self.parent(node)
+        if parent is None:
             # => root
             return ()
 
-        parent = self._child_to_parent[node]
-        assert parent is not None
-
-        return (parent,) + self.ancestors(parent)
+        return (parent, *self.ancestors(parent))
 
     def parent(self, node: NodeT) -> NodeT | None:
         """
         Returns the parent of *node*.
         """
-        assert node in self
-
         return self._child_to_parent[node]
 
     def children(self, node: NodeT) -> tuple[NodeT, ...]:
         """
         Returns the children of *node*.
         """
-        assert node in self
-
         return self._parent_to_children[node]
 
     @memoize_method
@@ -126,25 +121,18 @@ def depth(self, node: NodeT) -> int:
         """
         Returns the depth of *node*, with the root having depth 0.
         """
-        assert node in self
-
-        if self.is_root(node):
-            # => None
-            return 0
-
         parent_of_node = self.parent(node)
-        assert parent_of_node is not None
+        if parent_of_node is None:
+            return 0
 
         return 1 + self.depth(parent_of_node)
 
     def is_root(self, node: NodeT) -> bool:
-        assert node in self
-
+        """Return *True* if *node* is the root of the tree."""
         return self.parent(node) is None
 
     def is_leaf(self, node: NodeT) -> bool:
-        assert node in self
-
+        """Return *True* if *node* has no children."""
         return len(self.children(node)) == 0
 
     def __contains__(self, node: NodeT) -> bool:
@@ -161,9 +149,11 @@ def add_node(self, node: NodeT, parent: NodeT) -> Tree[NodeT]:
 
         siblings = self._parent_to_children[parent]
 
-        return Tree((self._parent_to_children
-                     .set(parent, siblings + (node,))
-                     .set(node, ())),
+        parent_to_children_mut = self._parent_to_children.mutate()
+        parent_to_children_mut[parent] = (*siblings, node)
+        parent_to_children_mut[node] = ()
+
+        return Tree(parent_to_children_mut.finish(),
                     self._child_to_parent.set(node, parent))
 
     def replace_node(self, node: NodeT, new_node: NodeT) -> Tree[NodeT]:
@@ -231,15 +221,14 @@ def move_node(self, node: NodeT, new_parent: NodeT | None) -> Tree[NodeT]:
         assert parent is not None  # parent=root handled as a special case
         siblings = self.children(parent)
         parents_new_children = tuple(frozenset(siblings) - frozenset([node]))
-        new_parents_children = self.children(new_parent) + (node,)
+        new_parents_children = (*self.children(new_parent), node)
 
-        new_child_to_parent = self._child_to_parent.set(node, new_parent)
-        new_parent_to_children = (self._parent_to_children
-                                  .set(parent, parents_new_children)
-                                  .set(new_parent, new_parents_children))
+        parent_to_children_mut = self._parent_to_children.mutate()
+        parent_to_children_mut[parent] = parents_new_children
+        parent_to_children_mut[new_parent] = new_parents_children
 
-        return Tree(new_parent_to_children,
-                    new_child_to_parent)
+        return Tree(parent_to_children_mut.finish(),
+                    self._child_to_parent.set(node, new_parent))
 
     def __str__(self) -> str:
         """
@@ -276,7 +265,7 @@ def post_process_last_child(children: Sequence[str]) -> list[str]:
                                 for c in children_result[:-1]]
                             + [post_process_last_child(c)
                                 for c in children_result[-1:]])
-            return [str(node)] + sum(children_result, start=[])
+            return [str(node), *reduce(operator.iadd, children_result, [])]
 
         return "\n".join(rec(self.root))
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 5284dda2a..c781d5780 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -31,20 +31,9 @@
 from dataclasses import dataclass, replace
 from enum import Enum, auto as enum_auto
 from functools import cached_property, partial
-from typing import (
-    Any,
-    Callable,
-    Generic,
-    Iterable,
-    Mapping,
-    Sequence,
-    Type,
-    TypeVar,
-    Union,
-    cast,
-)
 
 from immutabledict import immutabledict
+from typing import TYPE_CHECKING, ClassVar
 
 import islpy as isl
 import pymbolic.primitives as p
@@ -73,6 +62,10 @@
 from loopy.typing import Expression, ExpressionT, auto
 
 
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+
 __doc__ = """
 
 .. currentmodule:: loopy
@@ -443,7 +436,7 @@ def group_by(self, *args) -> ToCountMap[CountT]:
 
         # make sure all item keys have same type
         if self.count_map:
-            key_type = type(list(self.keys())[0])
+            key_type = type(next(iter(self.keys())))
             if not all(isinstance(x, key_type) for x in self.keys()):
                 raise ValueError("ToCountMap: group_by() function may only "
                                  "be used on ToCountMaps with uniform keys")
@@ -651,9 +644,10 @@ class CountGranularity(Enum):
 
     """
 
-    WORKITEM = 0
-    SUBGROUP = 1
-    WORKGROUP = 2
+    WORKITEM = "workitem"
+    SUBGROUP = "subgroup"
+    WORKGROUP = "workgroup"
+    ALL: ClassVar[Sequence[str]] = [WORKITEM, SUBGROUP, WORKGROUP]
 
 # }}}
 
@@ -716,7 +710,6 @@ class Op:
     .. attribute:: tags
 
         A :class:`frozenset` of tags to the operation.
-
     """
     dtype: LoopyType | None = None
     op_type: OpType | None = None
@@ -776,7 +769,7 @@ class MemAccess:
     .. attribute:: lid_strides
 
        A :class:`dict` of **{** :class:`int` **:**
-       :class:`pymbolic.primitives.Expression` or :class:`int` **}** that
+       :data:`~pymbolic.typing.Expression` or :class:`int` **}** that
        specifies local strides for each local id in the memory access index.
        Local ids not found will not be present in ``lid_strides.keys()``.
        Uniform access (i.e. work-items within a sub-group access the same
@@ -787,7 +780,7 @@ class MemAccess:
     .. attribute:: gid_strides
 
        A :class:`dict` of **{** :class:`int` **:**
-       :class:`pymbolic.primitives.Expression` or :class:`int` **}** that
+       :data:`~pymbolic.typing.Expression` or :class:`int` **}** that
        specifies global strides for each global id in the memory access index.
        global ids not found will not be present in ``gid_strides.keys()``.
 
@@ -1045,7 +1038,6 @@ def map_common_subexpression(
         raise RuntimeError("%s encountered %s--not supposed to happen"
                 % (type(self).__name__, type(expr).__name__))
 
-    map_substitution = map_common_subexpression
     map_derivative = map_common_subexpression
     map_slice = map_common_subexpression
 
@@ -1227,11 +1219,6 @@ def map_common_subexpression(self, expr, tags):
                                   "common_subexpression, "
                                   "map_common_subexpression not implemented.")
 
-    def map_substitution(self, expr, tags):
-        raise NotImplementedError("ExpressionOpCounter encountered "
-                                  "substitution, "
-                                  "map_substitution not implemented.")
-
     def map_derivative(self, expr, tags):
         raise NotImplementedError("ExpressionOpCounter encountered "
                                   "derivative, "
@@ -1944,7 +1931,7 @@ def get_op_map(
         if len(t_unit.entrypoints) > 1:
             raise LoopyError("Must provide entrypoint")
 
-        entrypoint = list(t_unit.entrypoints)[0]
+        entrypoint = next(iter(t_unit.entrypoints))
 
     assert entrypoint in t_unit.entrypoints
 
@@ -2175,7 +2162,7 @@ def get_mem_access_map(
         if len(t_unit.entrypoints) > 1:
             raise LoopyError("Must provide entrypoint")
 
-        entrypoint = list(t_unit.entrypoints)[0]
+        entrypoint = next(iter(t_unit.entrypoints))
 
     assert entrypoint in t_unit.entrypoints
 
@@ -2308,7 +2295,7 @@ def get_synchronization_map(
         if len(t_unit.entrypoints) > 1:
             raise LoopyError("Must provide entrypoint")
 
-        entrypoint = list(t_unit.entrypoints)[0]
+        entrypoint = next(iter(t_unit.entrypoints))
 
     assert entrypoint in t_unit.entrypoints
     from loopy.preprocess import infer_unknown_types, preprocess_program
@@ -2373,7 +2360,7 @@ def gather_access_footprints(
         if len(t_unit.entrypoints) > 1:
             raise LoopyError("Must provide entrypoint")
 
-        entrypoint = list(t_unit.entrypoints)[0]
+        entrypoint = next(iter(t_unit.entrypoints))
 
     assert entrypoint in t_unit.entrypoints
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index c595e8392..ff42749f8 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -24,30 +24,34 @@
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
-
-
 import re
+from dataclasses import dataclass, replace
 from functools import cached_property, reduce
 from sys import intern
 from typing import (
     TYPE_CHECKING,
     AbstractSet,
     Any,
+    ClassVar,
+    Concatenate,
+    Generic,
     Mapping,
     Sequence,
+    TypeAlias,
     cast,
 )
 from warnings import warn
 
 import immutables
 import numpy as np
+from typing_extensions import Self
 
 import islpy as isl
 import pymbolic.primitives  # FIXME: also import by full name to allow sphinx to resolve
 import pymbolic.primitives as p
 import pytools.lex
 from islpy import dim_type
-from pymbolic import ArithmeticExpressionT, Variable
+from pymbolic import ArithmeticExpression, Variable
 from pymbolic.mapper import (
     CachedCombineMapper as CombineMapperBase,
     CachedIdentityMapper as IdentityMapperBase,
@@ -56,13 +60,17 @@
     IdentityMapper as UncachedIdentityMapperBase,
     Mapper,
     P,
+    ResultT,
     WalkMapper as UncachedWalkMapperBase,
 )
 from pymbolic.mapper.coefficient import CoefficientCollector as CoefficientCollectorBase
 from pymbolic.mapper.constant_folder import (
     ConstantFoldingMapper as ConstantFoldingMapperBase,
 )
-from pymbolic.mapper.dependency import CachedDependencyMapper as DependencyMapperBase
+from pymbolic.mapper.dependency import (
+    CachedDependencyMapper as DependencyMapperBase,
+    DependenciesT,
+)
 from pymbolic.mapper.evaluator import CachedEvaluationMapper as EvaluationMapperBase
 from pymbolic.mapper.flattener import FlattenMapper as FlattenMapperBase
 from pymbolic.mapper.stringifier import StringifyMapper as StringifyMapperBase
@@ -71,8 +79,7 @@
 )
 from pymbolic.mapper.unifier import UnidirectionalUnifier as UnidirectionalUnifierBase
 from pymbolic.parser import Parser as ParserBase
-from pymbolic.typing import ArithmeticOrExpressionT
-from pytools import ImmutableRecord, memoize, memoize_method, memoize_on_first_arg
+from pytools import memoize, memoize_method, memoize_on_first_arg
 from pytools.tag import Tag, Taggable, ToTagSetConvertible
 
 from loopy.diagnostic import (
@@ -80,12 +87,19 @@
     LoopyError,
     UnableToDetermineAccessRangeError,
 )
-from loopy.types import LoopyType, NumpyType, ToLoopyTypeConvertible
-from loopy.typing import ExpressionT, auto
+from loopy.typing import Expression, not_none
 
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, Collection, Iterable
+
+    from pymbolic.typing import ArithmeticOrExpressionT
+
+    from loopy.kernel import LoopKernel
+    from loopy.kernel.data import KernelArgument, SubstitutionRule, TemporaryVariable
+    from loopy.kernel.instruction import InstructionBase
     from loopy.library.reduction import ReductionOperation, ReductionOpFunction
+    from loopy.types import LoopyType, NumpyType, ToLoopyTypeConvertible
 
 
 __doc__ = """
@@ -107,12 +121,18 @@
 .. autoclass:: LinearSubscript
 
 .. currentmodule:: loopy.symbolic
+.. autoclass:: SubArrayRef
+
 
 .. autoclass:: RuleArgument
+.. autoclass:: ResolvedFunction
+
+Rule-aware Mappers
+^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: SubstitutionRuleMappingContext
 .. autoclass:: ExpansionState
 .. autoclass:: RuleAwareIdentityMapper
-.. autoclass:: ResolvedFunction
-.. autoclass:: SubArrayRef
 
 
 Expression Manipulation Helpers
@@ -126,6 +146,14 @@
 .. class:: Variable
 
     See :class:`pymbolic.Variable`.
+
+.. class:: Expression
+
+    See :data:`pymbolic.typing.Expression`.
+
+.. class:: _Expression
+
+    See :class:`pymbolic.primitives.ExpressionNode`.
 """
 
 
@@ -136,23 +164,28 @@ def map_tagged_expression(self, expr: TaggedExpression, *args, **kwargs):
         new_expr = self.rec(expr.expr, *args, **kwargs)
         return TaggedExpression(expr.tags, new_expr)
 
-    def map_literal(self, expr: Literal, *args, **kwargs):
+    def map_literal(self,
+                    expr: Literal, *args: P.args, **kwargs: P.kwargs) -> Expression:
         return expr
 
-    def map_array_literal(self, expr: ArrayLiteral, *args, **kwargs):
+    def map_array_literal(
+                self,
+                expr: ArrayLiteral, *args: P.args, **kwargs: P.kwargs
+            ) -> Expression:
         return type(expr)(tuple(self.rec(ch, *args, **kwargs)
                                 for ch in expr.children))
 
-    def map_group_hw_index(self, expr, *args, **kwargs):
+    def map_group_hw_index(self, expr, *args: P.args, **kwargs: P.kwargs) -> Expression:
         return expr
 
-    def map_local_hw_index(self, expr, *args, **kwargs):
+    def map_local_hw_index(self, expr, *args: P.args, **kwargs: P.kwargs) -> Expression:
         return expr
 
-    def map_loopy_function_identifier(self, expr, *args, **kwargs):
+    def map_loopy_function_identifier(self, expr, *args: P.args, **kwargs: P.kwargs):
         return expr
 
-    def map_reduction(self, expr, *args, **kwargs):
+    def map_reduction(self,
+                      expr: Reduction, *args: P.args, **kwargs: P.kwargs) -> Expression:
         mapped_inames = [self.rec(p.Variable(iname), *args, **kwargs)
                          for iname in expr.inames]
 
@@ -174,11 +207,15 @@ def map_reduction(self, expr, *args, **kwargs):
                 new_expr,
                 allow_simultaneous=expr.allow_simultaneous)
 
-    def map_tagged_variable(self, expr, *args, **kwargs):
+    def map_tagged_variable(self, expr: TaggedVariable,
+                            *args: P.args, **kwargs: P.kwargs):
         # leaf, doesn't change
         return expr
 
-    def map_type_annotation(self, expr, *args, **kwargs):
+    def map_type_annotation(
+                self, expr: TypeAnnotation,
+                *args: P.args, **kwargs: P.kwargs
+            ) -> Expression:
         new_child = self.rec(expr.child, *args, **kwargs)
 
         if new_child is expr.child:
@@ -186,18 +223,20 @@ def map_type_annotation(self, expr, *args, **kwargs):
 
         return type(expr)(expr.type, new_child)
 
-    def map_sub_array_ref(self, expr, *args, **kwargs):
+    def map_sub_array_ref(self, expr, *args: P.args, **kwargs: P.kwargs):
         new_inames = self.rec(expr.swept_inames, *args, **kwargs)
         new_subscript = self.rec(expr.subscript, *args, **kwargs)
 
+        assert isinstance(new_inames, tuple)
+        assert isinstance(new_subscript, p.Subscript)
         if (all(new_iname is old_iname
                 for new_iname, old_iname in zip(new_inames, expr.swept_inames))
                 and new_subscript is expr.subscript):
             return expr
 
-        return SubArrayRef(new_inames, new_subscript)
+        return SubArrayRef(cast("tuple[Variable, ...]", new_inames), new_subscript)
 
-    def map_resolved_function(self, expr, *args, **kwargs):
+    def map_resolved_function(self, expr, *args: P.args, **kwargs: P.kwargs):
         # leaf, doesn't change
         return expr
 
@@ -212,25 +251,25 @@ def map_resolved_function(self, expr, *args, **kwargs):
 
 class FlattenMapper(FlattenMapperBase, IdentityMapperMixin):
     # FIXME: Lies! This needs to be made precise.
-    def is_expr_integer_valued(self, expr: ExpressionT) -> bool:
+    def is_expr_integer_valued(self, expr: Expression) -> bool:
         return True
 
 
 def flatten(expr: ArithmeticOrExpressionT) -> ArithmeticOrExpressionT:
-    return cast(ArithmeticOrExpressionT, FlattenMapper()(expr))
+    return cast("ArithmeticOrExpressionT", FlattenMapper()(expr))
 
 
-class IdentityMapper(IdentityMapperBase, IdentityMapperMixin):
+class IdentityMapper(IdentityMapperBase, IdentityMapperMixin, Generic[P]):
     pass
 
 
-class UncachedIdentityMapper(UncachedIdentityMapperBase,
-                             IdentityMapperMixin):
+class UncachedIdentityMapper(UncachedIdentityMapperBase[P],
+                             IdentityMapperMixin[P]):
     pass
 
 
 class PartialEvaluationMapper(
-        EvaluationMapperBase, IdentityMapperMixin[P]):
+        EvaluationMapperBase, IdentityMapperMixin[[]]):
     def map_variable(self, expr):
         return expr
 
@@ -238,70 +277,81 @@ def map_common_subexpression_uncached(self, expr):
         return type(expr)(self.rec(expr.child), expr.prefix, expr.scope)
 
 
-class WalkMapperMixin:
+class WalkMapperMixin(WalkMapperBase[P]):
     def map_tagged_expression(self, expr, *args, **kwargs):
         if not self.visit(expr, *args, **kwargs):
             return
 
         self.rec(expr.expr, *args, **kwargs)
-
-    def map_literal(self, expr, *args, **kwargs):
+
+    def map_literal(self, expr, *args: P.args, **kwargs: P.kwargs) -> None:
         self.visit(expr, *args, **kwargs)
 
-    def map_array_literal(self, expr, *args, **kwargs):
+    def map_array_literal(self, expr, *args: P.args, **kwargs: P.kwargs) -> None:
         if not self.visit(expr, *args, **kwargs):
             return
 
         for ch in expr.children:
             self.rec(ch, *args, **kwargs)
 
-    def map_group_hw_index(self, expr, *args, **kwargs):
+    def map_group_hw_index(self, expr, *args: P.args, **kwargs: P.kwargs) -> None:
         self.visit(expr, *args, **kwargs)
 
-    def map_local_hw_index(self, expr, *args, **kwargs):
+    def map_local_hw_index(self, expr, *args: P.args, **kwargs: P.kwargs) -> None:
         self.visit(expr, *args, **kwargs)
 
-    def map_reduction(self, expr, *args, **kwargs):
+    def map_reduction(self, expr, *args: P.args, **kwargs: P.kwargs) -> None:
         if not self.visit(expr, *args, **kwargs):
             return
 
         self.rec(expr.expr, *args, **kwargs)
 
-    def map_type_cast(self, expr, *args, **kwargs):
+    def map_type_cast(self, expr, *args: P.args, **kwargs: P.kwargs) -> None:
         if not self.visit(expr, *args, **kwargs):
             return
         self.rec(expr.child, *args, **kwargs)
 
     map_tagged_variable = WalkMapperBase.map_variable
 
-    def map_loopy_function_identifier(self, expr, *args, **kwargs):
+    def map_loopy_function_identifier(
+                self, expr, *args: P.args, **kwargs: P.kwargs
+            ) -> None:
         self.visit(expr, *args, **kwargs)
 
-    map_linear_subscript = WalkMapperBase.map_subscript
+    def map_linear_subscript(self,
+                expr: LinearSubscript,
+                *args: P.args, **kwargs: P.kwargs) -> None:
+        if not self.visit(expr, *args, **kwargs):
+            return
+
+        self.rec(expr.aggregate, *args, **kwargs)
+        self.rec(expr.index, *args, **kwargs)
+
+        self.post_visit(expr, *args, **kwargs)
 
     map_rule_argument = map_group_hw_index
 
-    def map_sub_array_ref(self, expr, *args):
-        if not self.visit(expr):
+    def map_sub_array_ref(self, expr, *args: P.args, **kwargs: P.kwargs):
+        if not self.visit(expr, *args, **kwargs):
             return
 
-        self.rec(expr.swept_inames, *args)
-        self.rec(expr.subscript, *args)
+        self.rec(expr.swept_inames, *args, **kwargs)
+        self.rec(expr.subscript, *args, **kwargs)
 
-    def map_resolved_function(self, expr, *args):
-        if not self.visit(expr):
+    def map_resolved_function(self, expr, *args: P.args, **kwargs: P.kwargs):
+        if not self.visit(expr, *args, **kwargs):
             return
 
-        self.rec(expr.function, *args)
+        self.rec(expr.function, *args, **kwargs)
 
     map_fortran_division = WalkMapperBase.map_quotient
 
 
-class WalkMapper(WalkMapperBase, WalkMapperMixin):
+class WalkMapper(WalkMapperMixin[P], WalkMapperBase[P]):
     pass
 
 
-class UncachedWalkMapper(UncachedWalkMapperBase, WalkMapperMixin):
+class UncachedWalkMapper(WalkMapperMixin[P], UncachedWalkMapperBase[P]):
     pass
 
 
@@ -310,26 +360,35 @@ class CallbackMapper(IdentityMapperMixin, CallbackMapperBase):
     map_resolved_function = CallbackMapperBase.map_constant
 
 
-class CombineMapper(CombineMapperBase):
+class CombineMapper(CombineMapperBase[ResultT, P]):
     def map_tagged_expression(self, expr, *args, **kwargs):
         return self.rec(expr.expr, *args, **kwargs)
-
-    def map_reduction(self, expr, *args, **kwargs):
+
+    def map_reduction(self, expr, *args: P.args, **kwargs: P.kwargs):
         return self.rec(expr.expr, *args, **kwargs)
 
-    def map_type_cast(self, expr, *args, **kwargs):
+    def map_type_cast(self, expr, *args: P.args, **kwargs: P.kwargs):
         return self.rec(expr.child, *args, **kwargs)
 
-    def map_sub_array_ref(self, expr, *args, **kwargs):
+    def map_sub_array_ref(self, expr, *args: P.args, **kwargs: P.kwargs):
         return self.combine((
             self.rec(expr.subscript, *args, **kwargs),
             self.combine(tuple(
                          self.rec(idx, *args, **kwargs)
                          for idx in expr.swept_inames))))
 
-    map_linear_subscript = CombineMapperBase.map_subscript
+    def map_linear_subscript(self,
+                expr: LinearSubscript, *args: P.args, **kwargs: P.kwargs
+            ) -> ResultT:
+        return self.combine(
+                [self.rec(expr.aggregate, *args, **kwargs),
+                    self.rec(expr.index, *args, **kwargs)])
 
-    map_fortran_division = CombineMapperBase.map_quotient
+    def map_fortran_division(self,
+            expr: FortranDivision, *args: P.args, **kwargs: P.kwargs) -> ResultT:
+        return self.combine((
+            self.rec(expr.numerator, *args, **kwargs),
+            self.rec(expr.denominator, *args, **kwargs)))
 
 
 class SubstitutionMapper(
@@ -348,11 +407,13 @@ def map_tagged_expression(self, expr, *args):
         from pymbolic.mapper.stringifier import PREC_NONE
-        return f"TaggedExpression({expr.tags}, {self.rec(expr.expr, PREC_NONE)}"
+        return f"TaggedExpression({expr.tags}, {self.rec(expr.expr, PREC_NONE)})"
 
-    def map_literal(self, expr, *args):
+    def map_literal(self, expr: Literal, enclosing_prec: int) -> str:
         return expr.s
 
-    def map_array_literal(self, expr, *args):
-        return "{%s}" % ", ".join(self.rec(ch) for ch in expr.children)
+    def map_array_literal(self, expr, enclosing_prec):
+        from pymbolic.mapper.stringifier import PREC_NONE
+
+        return "{%s}" % ", ".join(self.rec(ch, PREC_NONE) for ch in expr.children)
 
     def map_group_hw_index(self, expr, enclosing_prec):
         return "grp.%d" % expr.index
@@ -435,80 +496,97 @@ def map_tagged_variable(self, expr, other, urecs):
             return unify_many(urecs, new_uni_record)
 
 
-class DependencyMapper(DependencyMapperBase):
-    def map_group_hw_index(self, expr, *args, **kwargs):
+class DependencyMapper(DependencyMapperBase[P]):
+    def map_group_hw_index(
+                self,
+                expr: GroupHardwareAxisIndex, *args: P.args, **kwargs: P.kwargs
+            ) -> DependenciesT:
         return set()
 
-    def map_local_hw_index(self, expr, *args, **kwargs):
+    def map_local_hw_index(
+                self,
+                expr: LocalHardwareAxisIndex, *args: P.args, **kwargs: P.kwargs
+            ) -> DependenciesT:
         return set()
 
-    def map_call(self, expr, *args, **kwargs):
+    def map_call(
+                self,
+                expr: p.Call, *args: P.args, **kwargs: P.kwargs
+            ) -> DependenciesT:
         # Loopy does not have first-class functions. Do not descend
         # into 'function' attribute of Call.
         return self.rec(expr.parameters, *args, **kwargs)
 
-    def map_reduction(self, expr, *args, **kwargs):
+    def map_reduction(
+                self,
+                expr: Reduction, *args: P.args, **kwargs: P.kwargs
+            ) -> DependenciesT:
         deps = self.rec(expr.expr, *args, **kwargs)
         return deps - {Variable(iname) for iname in expr.inames}
 
-    def map_tagged_variable(self, expr, *args, **kwargs):
+    def map_tagged_variable(
+                self,
+                expr: TaggedVariable, *args: P.args, **kwargs: P.kwargs
+            ) -> DependenciesT:
         return {expr}
 
-    def map_loopy_function_identifier(self, expr, *args, **kwargs):
+    def map_loopy_function_identifier(self, expr, *args: P.args, **kwargs: P.kwargs):
         return set()
 
     def map_tagged_expression(self, expr, *args, **kwargs):
         deps = self.rec(expr.expr, *args, **kwargs)
         return deps
 
-    def map_sub_array_ref(self, expr, *args, **kwargs):
+    def map_sub_array_ref(self, expr, *args: P.args, **kwargs: P.kwargs):
         deps = self.rec(expr.subscript, *args, **kwargs)
         return deps - set(expr.swept_inames)
 
     map_linear_subscript = DependencyMapperBase.map_subscript
 
-    def map_type_cast(self, expr, *args, **kwargs):
+    def map_type_cast(self, expr, *args: P.args, **kwargs: P.kwargs):
         return self.rec(expr.child, *args, **kwargs)
 
-    def map_resolved_function(self, expr):
-        return self.rec(expr.function)
+    def map_resolved_function(self, expr, *args: P.args, **kwargs: P.kwargs):
+        return self.rec(expr.function, *args, **kwargs)
 
-    def map_literal(self, expr):
+    def map_literal(self, expr, *args: P.args, **kwargs: P.kwargs):
         return set()
 
-    def map_call_with_kwargs(self, expr):
+    def map_call_with_kwargs(self, expr, *args: P.args, **kwargs: P.kwargs):
         # See https://github.com/inducer/loopy/pull/323
         raise NotImplementedError
 
     map_fortran_division = DependencyMapperBase.map_quotient
 
 
-class SubstitutionRuleExpander(IdentityMapper):
-    def __init__(self, rules):
+class SubstitutionRuleExpander(IdentityMapper[[]]):
+    def __init__(self, rules: Mapping[str, SubstitutionRule]) -> None:
         self.rules = rules
         super().__init__()
 
-    def __call__(self, expr, *args, **kwargs):
+    def __call__(self, expr: Expression) -> Expression:
         if not self.rules:
             return expr
-        return super().__call__(expr, *args, **kwargs)
+        return super().__call__(expr)
 
-    def map_variable(self, expr):
+    def map_variable(self, expr: Variable) -> Expression:
         if expr.name in self.rules:
-            return self.map_substitution(expr.name, self.rules[expr.name], ())
+            return self.map_subst_rule(expr.name, self.rules[expr.name], ())
         else:
             return super().map_variable(expr)
 
-    def map_call(self, expr):
+    def map_call(self, expr: p.Call) -> Expression:
+        assert isinstance(expr.function, Variable | ResolvedFunction)
         if expr.function.name in self.rules:
-            return self.map_substitution(
+            assert isinstance(expr.function.name, str)
+            return self.map_subst_rule(
                     expr.function.name,
                     self.rules[expr.function.name],
                     expr.parameters)
         else:
             return super().map_call(expr)
 
-    def map_substitution(self, name, rule, arguments):
+    def map_subst_rule(self, name, rule, arguments):
         if len(rule.arguments) != len(arguments):
             from loopy.diagnostic import LoopyError
             raise LoopyError("number of arguments to '%s' does not match "
@@ -528,7 +606,7 @@ def map_substitution(self, name, rule, arguments):
 
 # {{{ loopy-specific primitives
 
-class LoopyExpressionBase(p.Expression):
+class LoopyExpressionBase(p.ExpressionNode):
     def stringifier(self):
         from loopy.diagnostic import LoopyError
         raise LoopyError("pymbolic < 2019.1 is in use. Please upgrade.")
@@ -562,7 +640,7 @@ class ArrayLiteral(LoopyExpressionBase):
         similar mappers). Not for use in Loopy source representation.
     """
 
-    children: tuple[ExpressionT, ...]
+    children: tuple[Expression, ...]
 
 
 @p.expr_dataclass()
@@ -625,7 +703,7 @@ class TypeAnnotation(LoopyExpressionBase):
     """
 
     type: LoopyType
-    child: ExpressionT
+    child: Expression
 
 
 @p.expr_dataclass(init=False)
@@ -641,10 +719,10 @@ class TypeCast(LoopyExpressionBase):
     # numpy pickling bug madness. (see loopy.types)
     _type_name: str
 
-    child: ExpressionT
+    child: Expression
     """The expression to be cast."""
 
-    def __init__(self, type: ToLoopyTypeConvertible, child: ExpressionT):
+    def __init__(self, type: ToLoopyTypeConvertible, child: Expression):
         super().__init__()
 
         from loopy.types import NumpyType, to_loopy_type
@@ -735,8 +813,6 @@ class Reduction(LoopyExpressionBase):
     .. autoattribute:: allow_simultaneous
     """
 
-    init_arg_names = ("operation", "inames", "expr", "allow_simultaneous")
-
     operation: ReductionOperation
 
     inames: Sequence[str]
@@ -744,11 +820,11 @@ class Reduction(LoopyExpressionBase):
     carried out.
     """
 
-    expr: ExpressionT
+    expr: Expression
     """An expression which may have tuple type. If the expression has tuple
     type, it must be one of the following:
 
-    * a :class:`tuple` of :class:`pymbolic.primitives.Expression`, or
+    * a :class:`tuple` of :data:`pymbolic.typing.Expression`, or
     * a :class:`loopy.symbolic.Reduction`, or
     * a function call or substitution rule invocation.
     """
@@ -762,7 +838,7 @@ def __init__(self,
                  operation: ReductionOperation | str,
                  inames: (tuple[str | pymbolic.primitives.Variable, ...]
                      | pymbolic.primitives.Variable | str),
-                 expr: ExpressionT,
+                 expr: Expression,
                  allow_simultaneous: bool = False
              ) -> None:
         if isinstance(inames, str):
@@ -784,14 +860,17 @@ def strip_var(iname: Any) -> str:
 
         if isinstance(operation, str):
             from loopy.library.reduction import parse_reduction_op
-            operation = parse_reduction_op(operation)
+            op = parse_reduction_op(operation)
+        else:
+            op = operation
+        del operation
 
         from loopy.library.reduction import ReductionOperation
-        assert isinstance(operation, ReductionOperation)
+        assert isinstance(op, ReductionOperation)
 
         from loopy.diagnostic import LoopyError
 
-        if operation.arg_count > 1:
+        if op.arg_count > 1:
             from pymbolic.primitives import Call
 
             if not isinstance(expr, (tuple, Reduction, Call)):
@@ -805,7 +884,7 @@ def strip_var(iname: Any) -> str:
             elif isinstance(expr, Reduction) and expr.is_tuple_typed:
                 raise LoopyError("got a tuple typed argument to a scalar reduction")
 
-        object.__setattr__(self, "operation", operation)
+        object.__setattr__(self, "operation", op)
         object.__setattr__(self, "inames", inames)
         object.__setattr__(self, "expr", expr)
         object.__setattr__(self, "allow_simultaneous", allow_simultaneous)
@@ -824,8 +903,8 @@ class LinearSubscript(LoopyExpressionBase):
     """Represents a linear index into a multi-dimensional array, completely
     ignoring any multi-dimensional layout.
     """
-    aggregate: ExpressionT
-    index: ExpressionT
+    aggregate: Expression
+    index: Expression
 
 
 @p.expr_dataclass()
@@ -887,19 +966,19 @@ def map_variable(self, expr):
             return expr
 
 
-class VariableInAnExpression(CombineMapper):
-    def __init__(self, variables_to_search):
+class VariableInAnExpression(CombineMapper[bool, []]):
+    def __init__(self, variables_to_search: Collection[Variable]) -> None:
         assert all(isinstance(variable, Variable) for variable in
             variables_to_search)
         self.variables_to_search = variables_to_search
 
-    def combine(self, values):
+    def combine(self, values) -> bool:
         return any(values)
 
-    def map_variable(self, expr):
+    def map_variable(self, expr) -> bool:
         return expr in self.variables_to_search
 
-    def map_constant(self, expr):
+    def map_constant(self, expr) -> bool:
         return False
 
 
@@ -976,6 +1055,7 @@ def __post_init__(self):
         assert isinstance(self.subscript, p.Subscript)
 
 
+@p.expr_dataclass()
 class FortranDivision(p.QuotientBase, LoopyExpressionBase):
     """This exists for the benefit of the Fortran frontend, which specializes
     to floating point division for floating point inputs and round-to-zero
@@ -987,46 +1067,51 @@ class FortranDivision(p.QuotientBase, LoopyExpressionBase):
         This is not a documented expression node type. It may disappear
         at any moment.
     """
-    mapper_method = "map_fortran_division"
 
 # }}}
 
 
-class DependencyMapperWithReductionInames(DependencyMapper):
-    def __init__(self, *args, **kwargs):
+class DependencyMapperWithReductionInames(DependencyMapper[P]):
+    def __init__(self, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
-        self.reduction_inames = set()
+        self.reduction_inames: set[str] = set()
 
-    def map_reduction(self, expr, *args, **kwargs):
+    def map_reduction(
+                self,
+                expr: Reduction, *args: P.args, **kwargs: P.kwargs
+            ) -> DependenciesT:
         self.reduction_inames.update(expr.inames)
         return super().map_reduction(expr, *args, **kwargs)
 
 
 @memoize
-def _get_dependencies_and_reduction_inames(expr):
-    dep_mapper = DependencyMapperWithReductionInames(composite_leaves=False)
-    deps = frozenset(dep.name for dep in dep_mapper(expr))
+def _get_dependencies_and_reduction_inames(
+            expr: Expression
+        ) -> tuple[AbstractSet[str], AbstractSet[str]]:
+    dep_mapper: DependencyMapperWithReductionInames[[]] = \
+        DependencyMapperWithReductionInames(composite_leaves=False)
+    deps = frozenset(cast("Variable", dep).name for dep in dep_mapper(expr))
     reduction_inames = dep_mapper.reduction_inames
     return deps, reduction_inames
 
 
-def get_dependencies(expr: ExpressionT | type[auto]) -> AbstractSet[str]:
+def get_dependencies(expr: Expression) -> AbstractSet[str]:
     return _get_dependencies_and_reduction_inames(expr)[0]
 
 
-def get_reduction_inames(expr: ExpressionT) -> AbstractSet[str]:
+def get_reduction_inames(expr: Expression) -> AbstractSet[str]:
     return _get_dependencies_and_reduction_inames(expr)[1]
 
 
-class SubArrayRefSweptInamesCollector(CombineMapper):
-    def combine(self, values):
+class SubArrayRefSweptInamesCollector(CombineMapper[AbstractSet[str], []]):
+    def combine(self, values: Iterable[AbstractSet[str]]) -> AbstractSet[str]:
         import operator
         return reduce(operator.or_, values, frozenset())
 
-    def map_sub_array_ref(self, expr):
+    def map_sub_array_ref(self, expr) -> AbstractSet[str]:
         return frozenset({iname.name for iname in expr.swept_inames})
 
-    def map_constant(self, expr):
+    def map_constant(self, expr) -> AbstractSet[str]:
         return frozenset()
 
     map_variable = map_constant
@@ -1055,7 +1140,8 @@ def parse_tagged_name(expr):
         raise RuntimeError("subst rule name not understood: %s" % expr)
 
 
-class ExpansionState(ImmutableRecord):
+@dataclass(frozen=True)
+class ExpansionState:
     """
     .. attribute:: kernel
     .. attribute:: instruction
@@ -1069,19 +1155,18 @@ class ExpansionState(ImmutableRecord):
 
         a dict representing current argument values
     """
-    def __init__(self, kernel, instruction, stack, arg_context):
-        if not isinstance(arg_context, immutables.Map):
-            warn(f"Got a {type(arg_context)} for arg_context,"
-                 " expected `immutables.Map`. This is deprecated"
-                 " and will result in an error from 2023.",
-                 DeprecationWarning, stacklevel=2)
-            arg_context = immutables.Map(arg_context)
-        super().__init__(kernel=kernel,
-                         instruction=instruction,
-                         stack=stack,
-                         arg_context=arg_context)
-
-    def __hash__(self):
+    kernel: LoopKernel
+    instruction: InstructionBase
+    stack: tuple[tuple[str, Tag], ...]
+    arg_context: immutables.Map[str, Expression]
+
+    def __post_init__(self) -> None:
+        hash(self.arg_context)
+
+    def copy(self, **kwargs: Any) -> Self:
+        return replace(self, **kwargs)
+
+    def __hash__(self) -> int:
         # do not try to be precise about hash of loopy kernel
         # or the instruction as computing the hash of pymbolic
         # expressions could have exponential complexity
@@ -1089,8 +1174,8 @@ def __hash__(self):
                      self.stack, self.arg_context))
 
     @property
-    def insn_id(self):
-        return self.instruction.id
+    def insn_id(self) -> str:
+        return not_none(self.instruction.id)
 
     def apply_arg_context(self, expr):
         from pymbolic.mapper.substitutor import make_subst_func
@@ -1098,34 +1183,34 @@ def apply_arg_context(self, expr):
                 make_subst_func(self.arg_context))(expr)
 
 
-class SubstitutionRuleRenamer(IdentityMapper):
+class SubstitutionRuleRenamer(IdentityMapper[[]]):
     def __init__(self, renames):
         self.renames = renames
         super().__init__()
 
-    def map_call(self, expr):
+    def map_call(self, expr: p.Call) -> Expression:
         if not isinstance(expr.function, p.Variable):
-            return IdentityMapper.map_call(self, expr)
+            return super().map_call(expr)
 
         name, tags = parse_tagged_name(expr.function)
 
         new_name = self.renames.get(name)
         if new_name is None:
-            return IdentityMapper.map_call(self, expr)
+            return super().map_call(expr)
 
         if tags:
-            sym = TaggedVariable(new_name, tags)
+            sym: p.Variable = TaggedVariable(new_name, tags)
         else:
             sym = p.Variable(new_name)
 
         return type(expr)(sym, tuple(self.rec(child) for child in expr.parameters))
 
-    def map_variable(self, expr):
+    def map_variable(self, expr: Variable) -> Expression:
         name, tags = parse_tagged_name(expr)
 
         new_name = self.renames.get(name)
         if new_name is None:
-            return IdentityMapper.map_variable(self, expr)
+            return super().map_variable(expr)
 
         if tags:
             return TaggedVariable(new_name, tags)
@@ -1256,7 +1341,7 @@ def finish_kernel(self, kernel):
             instructions=new_insns)
 
 
-class RuleAwareIdentityMapper(IdentityMapper):
+class RuleAwareIdentityMapper(IdentityMapper[Concatenate[ExpansionState, P]]):
     """Note: the third argument dragged around by this mapper is the
     current :class:`ExpansionState`.
 
@@ -1264,30 +1349,33 @@ class RuleAwareIdentityMapper(IdentityMapper):
     are in :attr:`ExpansionState.arg_context`.
     """
 
-    def __init__(self, rule_mapping_context):
+    def __init__(self, rule_mapping_context: SubstitutionRuleMappingContext) -> None:
         self.rule_mapping_context = rule_mapping_context
         super().__init__()
 
-    def map_variable(self, expr, expn_state, *args, **kwargs):
+    def map_variable(
+                self, expr: Variable, expn_state: ExpansionState,
+                *args: P.args, **kwargs: P.kwargs
+            ) -> Expression:
         name, tags = parse_tagged_name(expr)
         if name not in self.rule_mapping_context.old_subst_rules:
-            return IdentityMapper.map_variable(self, expr, expn_state, *args,
-                    **kwargs)
+            return super().map_variable(expr, expn_state, *args, **kwargs)
         else:
-            return self.map_substitution(name, tags, (), expn_state, *args,
-                    **kwargs)
+            return self.map_subst_rule(name, tags, (), expn_state, *args, **kwargs)
 
-    def map_call(self, expr, expn_state, *args, **kwargs):
+    def map_call(
+                self, expr: p.Call, expn_state: ExpansionState,
+                *args: P.args, **kwargs: P.kwargs
+            ) -> Expression:
         if not isinstance(expr.function, p.Variable):
-            return IdentityMapper.map_call(self, expr, expn_state,
-                                           *args, **kwargs)
+            return super().map_call(expr, expn_state, *args, **kwargs)
 
         name, tags = parse_tagged_name(expr.function)
 
         if name not in self.rule_mapping_context.old_subst_rules:
             return super().map_call(expr, expn_state, *args, **kwargs)
         else:
-            return self.map_substitution(name, tags,
+            return self.map_subst_rule(name, tags,
                                          self.rec(expr.parameters,
                                                   expn_state,
                                                   *args,
@@ -1299,9 +1387,9 @@ def map_call(self, expr, expn_state, *args, **kwargs):
     def make_new_arg_context(
             rule_name: str,
             arg_names: Sequence[str],
-            arguments: Sequence[ExpressionT],
-            arg_context: Mapping[str, ExpressionT]
-            ) -> Mapping[str, ExpressionT]:
+            arguments: Sequence[Expression],
+            arg_context: Mapping[str, Expression]
+            ) -> Mapping[str, Expression]:
         if len(arg_names) != len(arguments):
             raise RuntimeError("Rule '%s' invoked with %d arguments (needs %d)"
                     % (rule_name, len(arguments), len(arg_names), ))
@@ -1312,14 +1400,17 @@ def make_new_arg_context(
             formal_arg_name: arg_subst_map(arg_value)
             for formal_arg_name, arg_value in zip(arg_names, arguments)})
 
-    def map_substitution(self, name, tags, arguments, expn_state,
-                         *args, **kwargs):
+    def map_subst_rule(
+                self, name: str, tags, arguments, expn_state: ExpansionState,
+                *args: P.args, **kwargs: P.kwargs
+            ) -> Expression:
         rule = self.rule_mapping_context.old_subst_rules[name]
 
         rec_arguments = self.rec(arguments, expn_state, *args, **kwargs)
+        assert isinstance(rec_arguments, tuple)
 
         new_expn_state = expn_state.copy(
-                stack=expn_state.stack + ((name, tags),),
+                stack=(*expn_state.stack, (name, tags)),
                 arg_context=self.make_new_arg_context(
                     name, rule.arguments, rec_arguments, expn_state.arg_context))
 
@@ -1329,7 +1420,7 @@ def map_substitution(self, name, tags, arguments, expn_state,
                 name, rule.arguments, result)
 
         if tags:
-            sym = TaggedVariable(new_name, tags)
+            sym: p.Variable = TaggedVariable(new_name, tags)
         else:
             sym = p.Variable(new_name)
 
@@ -1356,8 +1447,8 @@ def __call__(self, expr, kernel, insn):
     def map_instruction(self, kernel, insn):
         return insn
 
-    def map_kernel(self, kernel, within=lambda *args: True,
-            map_args=True, map_tvs=True):
+    def map_kernel(self, kernel: LoopKernel, within=lambda *args: True,
+            map_args: bool = True, map_tvs: bool = True) -> LoopKernel:
         new_insns = [
             # While subst rules are not allowed in assignees, the mapper
             # may perform tasks entirely unrelated to subst rules, so
@@ -1377,7 +1468,7 @@ def map_kernel(self, kernel, within=lambda *args: True,
         # {{{ args
 
         if map_args:
-            new_args = [
+            new_args: Sequence[KernelArgument] = [
                 arg.map_exprs(non_insn_self) if isinstance(arg, ArrayBase) else arg
                 for arg in kernel.args]
         else:
@@ -1388,11 +1479,11 @@ def map_kernel(self, kernel, within=lambda *args: True,
         # {{{ tvs
 
         if map_tvs:
-            new_tvs = {
+            new_tvs: Mapping[str, TemporaryVariable] = {
                 tv_name: tv.map_exprs(non_insn_self)
                 for tv_name, tv in kernel.temporary_variables.items()}
         else:
-            new_tvs = kernel.temporary_variables.copy()
+            new_tvs = kernel.temporary_variables
 
         # }}}
 
@@ -1403,7 +1494,7 @@ def map_kernel(self, kernel, within=lambda *args: True,
                            temporary_variables=new_tvs)
 
 
-class RuleAwareSubstitutionMapper(RuleAwareIdentityMapper):
+class RuleAwareSubstitutionMapper(RuleAwareIdentityMapper[[]]):
     """
     Mapper to substitute expressions and record any divergence of substitution
     rule expressions of :class:`loopy.LoopKernel`.
@@ -1440,7 +1531,7 @@ def within(self, kernel, instruction, stack):
         else:
             return self._within(kernel, instruction, stack)
 
-    def map_variable(self, expr, expn_state):
+    def map_variable(self, expr: Variable, expn_state: ExpansionState) -> Expression:
         if (expr.name in expn_state.arg_context
                 or not self.within(
                     expn_state.kernel, expn_state.instruction, expn_state.stack)):
@@ -1456,15 +1547,17 @@ def map_variable(self, expr, expn_state):
                     expr, expn_state)
 
 
-class RuleAwareSubstitutionRuleExpander(RuleAwareIdentityMapper):
+class RuleAwareSubstitutionRuleExpander(RuleAwareIdentityMapper[[]]):
     def __init__(self, rule_mapping_context, rules, within):
         super().__init__(rule_mapping_context)
 
         self.rules = rules
         self.within = within
 
-    def map_substitution(self, name, tags, arguments, expn_state):
-        new_stack = expn_state.stack + ((name, tags),)
+    def map_subst_rule(
+                self, name: str, tags, arguments, expn_state: ExpansionState
+            ) -> Expression:
+        new_stack = (*expn_state.stack, (name, tags))
 
         if self.within(expn_state.kernel, expn_state.instruction, new_stack):
             # expand
@@ -1486,31 +1579,38 @@ def map_substitution(self, name, tags, arguments, expn_state):
 
         else:
             # do not expand
-            return super().map_substitution(
-                    name, tags, arguments, expn_state)
+            return super().map_subst_rule(name, tags, arguments, expn_state)
 
 # }}}
 
 
 # {{{ functions to primitives, parsing
 
-class VarToTaggedVarMapper(IdentityMapper):
-    def map_variable(self, expr):
+class VarToTaggedVarMapper(IdentityMapper[[]]):
+    def map_variable(self, expr: Variable) -> Variable:
         dollar_idx = expr.name.find("$")
         if dollar_idx == -1:
             return expr
         else:
-            return TaggedVariable(expr.name[:dollar_idx],
-                    expr.name[dollar_idx+1:])
+            from loopy.kernel.instruction import LegacyStringInstructionTag
+            return TaggedVariable(
+                        expr.name[:dollar_idx],
+                        frozenset({
+                            LegacyStringInstructionTag(expr.name[dollar_idx+1:])
+                        })
+                    )
 
 
-class FunctionToPrimitiveMapper(UncachedIdentityMapper):
+class FunctionToPrimitiveMapper(UncachedIdentityMapper[[]]):
     """Looks for invocations of a function called 'cse' or 'reduce' and
     turns those into the actual pymbolic primitives used for that.
     """
 
-    def _parse_reduction(self, operation, inames, red_exprs,
-            allow_simultaneous=False):
+    def _parse_reduction(self,
+                operation: ReductionOperation,
+                inames: Expression,
+                red_exprs: tuple[Expression, ...],
+                allow_simultaneous: bool = False) -> Reduction:
         if isinstance(inames, p.Variable):
             inames = (inames,)
 
@@ -1518,7 +1618,7 @@ def _parse_reduction(self, operation, inames, red_exprs,
             raise TypeError("iname argument to reduce() must be a symbol "
                     "or a list/tuple of symbols")
 
-        processed_inames = []
+        processed_inames: list[str] = []
         for iname in inames:
             if not isinstance(iname, p.Variable):
                 raise TypeError("iname argument to reduce() must be a symbol "
@@ -1527,16 +1627,18 @@ def _parse_reduction(self, operation, inames, red_exprs,
             processed_inames.append(iname.name)
 
         if len(red_exprs) == 1:
-            red_exprs = red_exprs[0]
+            expr_or_exprs: Expression | tuple[Expression, ...] = red_exprs[0]
+        else:
+            expr_or_exprs = red_exprs
 
-        return Reduction(operation, tuple(processed_inames), red_exprs,
+        return Reduction(operation, tuple(processed_inames), expr_or_exprs,
                 allow_simultaneous=allow_simultaneous)
 
-    def map_call(self, expr):
+    def map_call(self, expr: p.Call) -> Expression:
         from loopy.library.reduction import parse_reduction_op
 
         if not isinstance(expr.function, p.Variable):
-            return IdentityMapper.map_call(self, expr)
+            return super().map_call(expr)
 
         name = expr.function.name
         if name == "cse":
@@ -1557,11 +1659,11 @@ def map_call(self, expr):
         elif name in ["reduce", "simul_reduce"]:
 
             if len(expr.parameters) >= 3:
-                operation, inames = expr.parameters[:2]
+                op_expr, inames = expr.parameters[:2]
                 red_exprs = expr.parameters[2:]
 
-                operation = parse_reduction_op(str(operation))
-                return self._parse_reduction(operation, inames,
+                operation = parse_reduction_op(str(op_expr))
+                return self._parse_reduction(not_none(operation), inames,
                         tuple(self.rec(red_expr) for red_expr in red_exprs),
                         allow_simultaneous=(name == "simul_reduce"))
             else:
@@ -1602,18 +1704,22 @@ def map_call(self, expr):
                 return self._parse_reduction(operation, inames, red_exprs)
 
             else:
-                return IdentityMapper.map_call(self, expr)
+                return super().map_call(expr)
 
 
 # {{{ customization to pymbolic parser
 
 _open_dbl_bracket = intern("open_dbl_bracket")
 
-TRAILING_FLOAT_TAG_RE = re.compile("^(.*?)([a-zA-Z]*)$")
+TRAILING_FLOAT_TAG_RE = re.compile(r"^(.*?)([a-zA-Z]*)$")
+
+
+LexTable: TypeAlias = Sequence[
+        tuple[str, pytools.lex.RE | tuple[str | pytools.lex.RE, ...]]]
 
 
 class LoopyParser(ParserBase):
-    lex_table = [
+    lex_table: ClassVar[LexTable] = [
             (_open_dbl_bracket, pytools.lex.RE(r"\[\[")),
             *ParserBase.lex_table
             ]
@@ -1724,8 +1830,8 @@ def map_subscript(self, expr):
 
 # {{{ variable index expression collector
 
-class ArrayAccessFinder(CombineMapper):
-    def __init__(self, tgt_vector_name=None):
+class ArrayAccessFinder(CombineMapper[AbstractSet[p.Subscript], []]):
+    def __init__(self, tgt_vector_name: str | None = None) -> None:
         self.tgt_vector_name = tgt_vector_name
         super().__init__()
 
@@ -1733,27 +1839,27 @@ def combine(self, values):
         from pytools import flatten
         return set(flatten(values))
 
-    def map_constant(self, expr):
+    def map_constant(self, expr: object) -> AbstractSet[p.Subscript]:
         return set()
 
-    def map_algebraic_leaf(self, expr):
+    def map_algebraic_leaf(self, expr) -> AbstractSet[p.Subscript]:
         return set()
 
-    def map_subscript(self, expr):
+    def map_subscript(self, expr) -> AbstractSet[p.Subscript]:
         assert isinstance(expr.aggregate, p.Variable)
 
         if self.tgt_vector_name is None \
                 or expr.aggregate.name == self.tgt_vector_name:
             return {expr} | self.rec(expr.index)
         else:
-            return CombineMapper.map_subscript(self, expr)
+            return super().map_subscript(expr)
 
 # }}}
 
 
 # {{{ (pw)aff to expr conversion
 
-def aff_to_expr(aff: isl.Aff) -> ArithmeticExpressionT:
+def aff_to_expr(aff: isl.Aff) -> ArithmeticExpression:
     from pymbolic import var
 
     denom = aff.get_denominator_val().to_python()
@@ -1774,7 +1880,7 @@ def aff_to_expr(aff: isl.Aff) -> ArithmeticExpressionT:
     return flatten(result // denom)
 
 
-def pw_aff_to_expr(pw_aff: isl.PwAff, int_ok: bool = False) -> ExpressionT:
+def pw_aff_to_expr(pw_aff: isl.PwAff, int_ok: bool = False) -> Expression:
     if isinstance(pw_aff, int):
         if not int_ok:
             warn("expected PwAff, got int", stacklevel=2)
@@ -1816,7 +1922,7 @@ def pw_aff_to_pw_aff_implemented_by_expr(pw_aff: isl.PwAff) -> isl.PwAff:
 
 # {{{ (pw)aff_from_expr
 
-class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin):
+class PwAffEvaluationMapper(EvaluationMapperBase[isl.PwAff], IdentityMapperMixin[[]]):
     def __init__(self, space, vars_to_zero):
         self.zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space))
 
@@ -1869,7 +1975,7 @@ def map_remainder(self, expr):
             raise TypeError("modulo non-constant in '%s' not supported "
                     "for as-pwaff evaluation" % expr)
 
-        (s, denom_aff), = denom.get_pieces()
+        (_s, denom_aff), = denom.get_pieces()
         denom = denom_aff.get_constant_val()
 
         return num.mod_val(denom)
@@ -1888,7 +1994,7 @@ def map_call(self, expr):
                 "for as-pwaff evaluation")
 
 
-def aff_from_expr(space: isl.Space, expr: ExpressionT, vars_to_zero=None) -> isl.Aff:
+def aff_from_expr(space: isl.Space, expr: Expression, vars_to_zero=None) -> isl.Aff:
     if vars_to_zero is None:
         vars_to_zero = frozenset()
 
@@ -1896,7 +2002,7 @@ def aff_from_expr(space: isl.Space, expr: ExpressionT, vars_to_zero=None) -> isl
 
     pieces = pwaff.get_pieces()
     if len(pieces) == 1:
-        (s, aff), = pieces
+        (_s, aff), = pieces
         return aff
     else:
         from loopy.diagnostic import ExpressionNotAffineError
@@ -1952,7 +2058,7 @@ def guarded_pwaff_from_expr(space, expr, vars_to_zero=None):
 
 # {{{ (pw_)?qpoly_from_expr
 
-class PwQPolyEvaluationMapper(EvaluationMapperBase):
+class PwQPolyEvaluationMapper(EvaluationMapperBase[isl.PwQPolynomial]):
     def __init__(self, space, vars_to_zero):
         zero_qpoly = isl.QPolynomial.zero_on_domain(space)
 
@@ -2000,7 +2106,7 @@ def qpolynomial_from_expr(space, expr):
 
     pieces = pw_qpoly.get_pieces()
     if len(pieces) == 1:
-        (s, qpoly), = pieces
+        (_s, qpoly), = pieces
         return qpoly
     else:
         raise RuntimeError("expression '%s' could not be converted to a "
@@ -2029,7 +2135,7 @@ def simplify_using_aff(kernel, expr):
     """
     Simplifies *expr* on *kernel*'s domain.
 
-    :arg expr: An instance of :class:`pymbolic.primitives.Expression`.
+    :arg expr: An instance of :data:`pymbolic.typing.Expression`.
     """
     deps = get_dependencies(expr)
 
@@ -2153,7 +2259,7 @@ def constraint_to_cond_expr(cns):
 
 # {{{ isl_set_from_expr
 
-class ConditionExpressionToBooleanOpsExpression(IdentityMapper):
+class ConditionExpressionToBooleanOpsExpression(IdentityMapper[[]]):
     """
     Mapper to convert expressions into composition of boolean operation nodes
     according to C-semantics.
@@ -2183,7 +2289,7 @@ def map_reduction(self, expr):
                 "to affine")
 
 
-class AffineConditionToISLSetMapper(IdentityMapper):
+class AffineConditionToISLSetMapper(IdentityMapper[[]]):
     """
     Mapper to convert a condition :class:`~pymbolic.primitives.Expression` to a
     :class:`~islpy.Set`.
@@ -2304,15 +2410,20 @@ def set_to_cond_expr(isl_set):
 
 # {{{ Reduction callback mapper
 
-class ReductionCallbackMapper(UncachedIdentityMapper):
-    def __init__(self, callback):
+class ReductionCallbackMapper(UncachedIdentityMapper[P]):
+    def __init__(
+                self,
+                callback: Callable[[
+                    Reduction,
+                    Callable[Concatenate[Expression, P], Expression]
+                ], Expression]) -> None:
         self.callback = callback
         super().__init__()
 
-    def map_reduction(self, expr, **kwargs):
+    def map_reduction(self, expr, *args: P.args, **kwargs: P.kwargs) -> Expression:
         result = self.callback(expr, self.rec, **kwargs)
         if result is None:
-            return IdentityMapper.map_reduction(self, expr, **kwargs)
+            return super().map_reduction(expr, *args, **kwargs)
         return result
 
 # }}}
@@ -2320,8 +2431,8 @@ def map_reduction(self, expr, **kwargs):
 
 # {{{ index dependency finding
 
-class IndexVariableFinder(CombineMapper):
-    def __init__(self, include_reduction_inames):
+class IndexVariableFinder(CombineMapper[AbstractSet[Expression], []]):
+    def __init__(self, include_reduction_inames: bool) -> None:
         self.include_reduction_inames = include_reduction_inames
 
     def combine(self, values):
@@ -2362,11 +2473,11 @@ def map_reduction(self, expr):
 # {{{ wildcard -> unique variable mapper
 
 class WildcardToUniqueVariableMapper(IdentityMapper):
-    def __init__(self, unique_var_name_factory):
+    def __init__(self, unique_var_name_factory: Callable[[], str]) -> None:
         self.unique_var_name_factory = unique_var_name_factory
         super().__init__()
 
-    def map_wildcard(self, expr):
+    def map_wildcard(self, expr: p.Wildcard) -> Variable:
         from pymbolic import var
         return var(self.unique_var_name_factory())
 
@@ -2375,7 +2486,7 @@ def map_wildcard(self, expr):
 
 # {{{ prime ("'") adder
 
-class PrimeAdder(IdentityMapper):
+class PrimeAdder(IdentityMapper[[]]):
     def __init__(self, which_vars):
         self.which_vars = which_vars
 
@@ -2520,7 +2631,7 @@ def get_access_range(domain, subscript, assumptions=None, shape=None,
 
 # {{{ access range mapper
 
-class BatchedAccessMapMapper(WalkMapper):
+class BatchedAccessMapMapper(WalkMapper[[AbstractSet[str]]]):
 
     def __init__(self, kernel, var_names, overestimate=False):
         self.kernel = kernel
@@ -2531,7 +2642,7 @@ def __init__(self, kernel, var_names, overestimate=False):
         self._var_names = set(var_names)
         super().__init__()
 
-    def get_access_range(self, var_name):
+    def get_access_range(self, var_name: str) -> isl.Set:
         loops_to_amaps = self.access_maps[var_name]
         if not loops_to_amaps:
             return None
@@ -2540,9 +2651,9 @@ def get_access_range(self, var_name):
         from functools import reduce
         return reduce(operator.or_, (val.range() for val in loops_to_amaps.values()))
 
-    def map_subscript(self, expr, inames):
+    def map_subscript(self, expr: p.Subscript, inames: AbstractSet[str]) -> None:
         domain = self.kernel.get_inames_domain(inames)
-        WalkMapper.map_subscript(self, expr, inames)
+        super().map_subscript(expr, inames)
 
         assert isinstance(expr.aggregate, p.Variable)
 
@@ -2585,19 +2696,21 @@ def map_subscript(self, expr, inames):
         else:
             self.access_maps[arg_name][inames] |= access_map
 
-    def map_linear_subscript(self, expr, inames):
+    def map_linear_subscript(
+                self,
+                expr: LinearSubscript, inames: AbstractSet[str]
+            ) -> None:
         self.rec(expr.index, inames)
 
+        assert isinstance(expr.aggregate, Variable)
         if expr.aggregate.name in self._var_names:
             self.bad_subscripts[expr.aggregate.name].append(expr)
 
     def map_reduction(self, expr, inames):
         return WalkMapper.map_reduction(self, expr, inames | set(expr.inames))
 
-    def map_type_cast(self, expr, inames):
-        return self.rec(expr.child, inames)
-
-    def map_sub_array_ref(self, expr, inames):
+    def map_sub_array_ref(self, expr: SubArrayRef, inames: AbstractSet[str]) -> None:
+        assert isinstance(expr.subscript.aggregate, Variable)
         arg_name = expr.subscript.aggregate.name
         if arg_name not in self._var_names:
             return
@@ -2739,14 +2852,16 @@ def do_access_ranges_overlap_conservative(
 
 # {{{ is_expression_equal
 
-def is_expression_equal(a, b):
+def is_expression_equal(a: Expression, b: Expression) -> bool:
     if a == b:
         return True
 
-    if isinstance(a, p.Expression) or isinstance(b, p.Expression):
+    if isinstance(a, p.ExpressionNode) or isinstance(b, p.ExpressionNode):
         if a is None or b is None:
             return False
 
+        assert p.is_arithmetic_expression(a)
+        assert p.is_arithmetic_expression(b)
         maybe_zero = a - b
         from pymbolic import distribute
 
@@ -2757,7 +2872,10 @@ def is_expression_equal(a, b):
         return False
 
 
-def is_tuple_of_expressions_equal(a, b):
+def is_tuple_of_expressions_equal(
+            a: Expression | None,
+            b: Expression | None,
+        ) -> bool:
     if a is None or b is None:
         if a is None and b is None:
             return True
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 48ab04f89..00f1891bd 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -10,15 +10,6 @@
 .. autoclass:: OpenCLTarget
 .. autoclass:: PyOpenCLTarget
 .. autoclass:: ISPCTarget
-
-References to Canonical Names
------------------------------
-
-.. currentmodule:: loopy.target
-
-.. class:: TargetBase
-
-    See :class:`loopy.TargetBase`.
 """
 
 from __future__ import annotations
@@ -52,9 +43,7 @@
     Any,
     ClassVar,
     Generic,
-    Optional,
     Sequence,
-    Tuple,
     TypeVar,
 )
 
@@ -64,7 +53,7 @@
     from loopy.codegen.result import CodeGenerationResult
     from loopy.target.execution import ExecutorBase
     from loopy.translation_unit import FunctionIdT, TranslationUnit
-    from loopy.typing import ExpressionT
+    from loopy.typing import Expression
 
 
 ASTType = TypeVar("ASTType")
@@ -79,8 +68,8 @@ class TargetBase:
 
     # {{{ hashing/equality
 
-    hash_fields: ClassVar[Tuple[str, ...]] = ()
-    comparison_fields: ClassVar[Tuple[str, ...]] = ()
+    hash_fields: ClassVar[tuple[str, ...]] = ()
+    comparison_fields: ClassVar[tuple[str, ...]] = ()
 
     def __hash__(self):
         # NOTE: _hash_value may vanish during pickling
@@ -226,7 +215,7 @@ def get_function_definition(
     def get_function_declaration(
             self, codegen_state: CodeGenerationState,
             codegen_result: CodeGenerationResult, schedule_index: int
-            ) -> Tuple[Sequence[Tuple[str, str]], Optional[ASTType]]:
+            ) -> tuple[Sequence[tuple[str, str]], ASTType | None]:
         """Returns preambles and the AST for the function declaration."""
         raise NotImplementedError
 
@@ -240,8 +229,8 @@ def get_temporary_decls(self, codegen_state: CodeGenerationState,
 
     def get_kernel_call(self, codegen_state: CodeGenerationState,
             subkernel_name: str,
-            gsize: Tuple[ExpressionT, ...],
-            lsize: Tuple[ExpressionT, ...]) -> Optional[ASTType]:
+            gsize: tuple[Expression, ...],
+            lsize: tuple[Expression, ...]) -> ASTType | None:
         raise NotImplementedError()
 
     @property
@@ -331,7 +320,7 @@ def get_function_definition(self, codegen_state, codegen_result,
     def get_function_declaration(
             self, codegen_state, codegen_result,
             schedule_index,
-            ) -> Tuple[Sequence[Tuple[str, str]], None]:
+            ) -> tuple[Sequence[tuple[str, str]], None]:
         return [], None
 
     def get_temporary_decls(self, codegen_state, schedule_index):
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 9f227bd37..a4990b5c6 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -1,4 +1,5 @@
 """Plain C target and base for other C-family languages."""
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
@@ -24,9 +25,9 @@
 """
 
 import re
-from typing import Any, Optional, Sequence, Tuple, cast
+from typing import TYPE_CHECKING, Any, Sequence, cast
 
-import numpy as np  # noqa
+import numpy as np
 
 import pymbolic.primitives as p
 from cgen import (
@@ -43,10 +44,7 @@
 from pymbolic.mapper.stringifier import PREC_NONE
 from pytools import memoize_method
 
-from loopy.codegen import CodeGenerationState
-from loopy.codegen.result import CodeGenerationResult
 from loopy.diagnostic import LoopyError, LoopyTypeError
-from loopy.kernel import LoopKernel
 from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag
 from loopy.kernel.data import (
     AddressSpace,
@@ -57,14 +55,20 @@
     ValueArg,
 )
 from loopy.kernel.function_interface import ScalarCallable
-from loopy.schedule import CallKernel
 from loopy.symbolic import IdentityMapper
 from loopy.target import ASTBuilderBase, DummyHostASTBuilder, TargetBase
-from loopy.target.execution import ExecutorBase
 from loopy.tools import remove_common_indentation
-from loopy.translation_unit import FunctionIdT, TranslationUnit
 from loopy.types import LoopyType, NumpyType, to_loopy_type
-from loopy.typing import ExpressionT, auto
+from loopy.typing import Expression, auto
+
+
+if TYPE_CHECKING:
+    from loopy.codegen import CodeGenerationState
+    from loopy.codegen.result import CodeGenerationResult
+    from loopy.kernel import LoopKernel
+    from loopy.schedule import CallKernel
+    from loopy.target.execution import ExecutorBase
+    from loopy.translation_unit import FunctionIdT, TranslationUnit
 
 
 __doc__ = """
@@ -259,7 +263,7 @@ def _preamble_generator(preamble_info, func_qualifier="inline"):
             inline {res_ctype} {func.c_name}({base_ctype} x, {exp_ctype} n) {{
               if (n == 0)
                 return 1;
-              {re.sub("^", 14*" ", signed_exponent_preamble, flags=re.M)}
+              {re.sub(r"^", 14*" ", signed_exponent_preamble, flags=re.M)}
 
               {res_ctype} y = 1;
 
@@ -414,8 +418,8 @@ class CFamilyTarget(TargetBase):
     usable as a common base for C99, C++, OpenCL, CUDA, and the like.
     """
 
-    hash_fields = TargetBase.hash_fields + ("fortran_abi",)
-    comparison_fields = TargetBase.comparison_fields + ("fortran_abi",)
+    hash_fields = (*TargetBase.hash_fields, "fortran_abi")
+    comparison_fields = (*TargetBase.comparison_fields, "fortran_abi")
 
     def __init__(self, fortran_abi=False):
         self.fortran_abi = fortran_abi
@@ -772,16 +776,13 @@ class CFamilyASTBuilder(ASTBuilderBase[Generable]):
 
     def symbol_manglers(self):
         return (
-                super().symbol_manglers() + [
-                    c_symbol_mangler
-                    ])
+                [*super().symbol_manglers(), c_symbol_mangler])
 
     def preamble_generators(self):
         return (
-                super().preamble_generators() + [
-                    lambda preamble_info: _preamble_generator(preamble_info,
-                        self.preamble_function_qualifier),
-                    ])
+                [*super().preamble_generators(),
+                    lambda preamble_info: _preamble_generator(
+                          preamble_info, self.preamble_function_qualifier)])
 
     @property
     def known_callables(self):
@@ -794,9 +795,12 @@ def known_callables(self):
     # {{{ code generation
 
     def get_function_definition(
-            self, codegen_state: CodeGenerationState,
+            self,
+            codegen_state: CodeGenerationState,
             codegen_result: CodeGenerationResult,
-            schedule_index: int, function_decl: Generable, function_body: Generable
+            schedule_index: int,
+            function_decl: Generable,
+            function_body: Generable
             ) -> Generable:
         kernel = codegen_state.kernel
         assert kernel.linearization is not None
@@ -828,37 +832,44 @@ def get_function_definition(
                             self.get_temporary_var_declarator(codegen_state, tv))
 
                     if tv.initializer is not None:
-                        decl = Initializer(decl, generate_array_literal(
+                        init_decl = Initializer(decl, generate_array_literal(
                             codegen_state, tv, tv.initializer))
+                    else:
+                        init_decl = decl
+
+                    result.append(init_decl)
 
-                    result.append(decl)
+        assert isinstance(function_decl, FunctionDeclarationWrapper)
+        if not isinstance(function_body, Block):
+            function_body = Block([function_body])
 
         fbody = FunctionBody(function_decl, function_body)
+
         if not result:
             return fbody
         else:
-            return Collection(result+[Line(), fbody])
+            return Collection([*result, Line(), fbody])
 
     def get_function_declaration(
             self, codegen_state: CodeGenerationState,
             codegen_result: CodeGenerationResult, schedule_index: int
-            ) -> Tuple[Sequence[Tuple[str, str]], Generable]:
+            ) -> tuple[Sequence[tuple[str, str]], Generable]:
         kernel = codegen_state.kernel
 
         assert codegen_state.kernel.linearization is not None
         subkernel_name = cast(
-                        CallKernel,
+                        "CallKernel",
                         codegen_state.kernel.linearization[schedule_index]
                         ).kernel_name
 
         from cgen import FunctionDeclaration, Value
 
-        name = codegen_result.current_program(codegen_state).name
+        name_str = codegen_result.current_program(codegen_state).name
         if self.target.fortran_abi:
-            name += "_"
+            name_str += "_"
 
         if codegen_state.is_entrypoint:
-            name = Value("void", name)
+            name: Declarator = Value("void", name_str)
 
             # subkernel launches occur only as part of entrypoint kernels for now
             from loopy.schedule.tools import get_subkernel_arg_info
@@ -866,7 +877,7 @@ def get_function_declaration(
             passed_names = skai.passed_names
             written_names = skai.written_names
         else:
-            name = Value("static void", name)
+            name = Value("static void", name_str)
             passed_names = [arg.name for arg in kernel.args]
             written_names = kernel.get_written_variables()
 
@@ -880,8 +891,8 @@ def get_function_declaration(
 
     def get_kernel_call(self, codegen_state: CodeGenerationState,
             subkernel_name: str,
-            gsize: Tuple[ExpressionT, ...],
-            lsize: Tuple[ExpressionT, ...]) -> Optional[Generable]:
+            gsize: tuple[Expression, ...],
+            lsize: tuple[Expression, ...]) -> Generable | None:
         return None
 
     def emit_temp_var_decl_for_tv_with_base_storage(self,
@@ -895,11 +906,11 @@ def emit_temp_var_decl_for_tv_with_base_storage(self,
         assert isinstance(tv.address_space, AddressSpace)
         ecm = codegen_state.expression_to_code_mapper
 
-        cast_decl = POD(self, tv.dtype, "")
-        temp_var_decl = POD(self, tv.dtype, tv.name)
+        cast_decl: Declarator = POD(self, tv.dtype, "")
+        temp_var_decl: Declarator = POD(self, tv.dtype, tv.name)
 
         if tv._base_storage_access_may_be_aliasing:
-            ptrtype = _ConstPointer
+            ptrtype: type[Pointer] = _ConstPointer
         else:
             # The 'restrict' part of this is a complete lie--of course
             # all these temporaries are aliased. But we're promising to
@@ -948,8 +959,6 @@ def get_temporary_decls(self, codegen_state, schedule_index):
                 sub_knl_temps
                 | supporting_temporary_names(kernel, sub_knl_temps))
 
-        ecm = self.get_expression_to_code_mapper(codegen_state)
-
         for tv_name in sorted(sub_knl_temps):
             tv = kernel.temporary_variables[tv_name]
             if not tv.base_storage:
@@ -1021,7 +1030,7 @@ def wrap_global_constant(self, decl: Declarator) -> Declarator:
 
     def get_value_arg_declaraotor(
             self, name: str, dtype: LoopyType, is_written: bool) -> Declarator:
-        result = POD(self, dtype, name)
+        result: Declarator = POD(self, dtype, name)
 
         if not is_written:
             from cgen import Const
@@ -1051,7 +1060,7 @@ def get_array_base_declarator(self, ary: ArrayBase) -> Declarator:
     def get_array_arg_declarator(
             self, arg: ArrayArg, is_written: bool) -> Declarator:
         from cgen import RestrictPointer
-        arg_decl = RestrictPointer(
+        arg_decl: Declarator = RestrictPointer(
                 self.wrap_decl_for_address_space(
                     self.get_array_base_declarator(arg), arg.address_space))
 
@@ -1073,10 +1082,10 @@ def get_temporary_arg_decl(
             from cgen import RestrictPointer
             assert temp_var.address_space is not auto
 
-            arg_decl = RestrictPointer(
+            arg_decl: Declarator = RestrictPointer(
                     self.wrap_decl_for_address_space(
                         self.get_array_base_declarator(temp_var),
-                        cast(AddressSpace, temp_var.address_space)))
+                        cast("AddressSpace", temp_var.address_space)))
             if not is_written:
                 arg_decl = Const(arg_decl)
 
@@ -1281,7 +1290,7 @@ def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
                 inner)
 
         if hints:
-            return Collection(list(hints) + [loop])
+            return Collection([*list(hints), loop])
         else:
             return loop
 
@@ -1339,8 +1348,7 @@ def map_expression(self, expr):
 
     def map_function_decl_wrapper(self, node):
         self.decls.append(node.subdecl)
-        return super()\
-                .map_function_decl_wrapper(node)
+        return super().map_function_decl_wrapper(node)
 
 
 def generate_header(kernel, codegen_result=None):
@@ -1397,9 +1405,7 @@ def get_dtype_registry(self):
 class CASTBuilder(CFamilyASTBuilder):
     def preamble_generators(self):
         return (
-                super().preamble_generators() + [
-                    c99_preamble_generator,
-                    ])
+                [*super().preamble_generators(), c99_preamble_generator])
 
 # }}}
 
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index 9cde501a7..270c3d0dc 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2017 Nick Curtis"
 
 __license__ = """
@@ -25,30 +28,34 @@
 import os
 import tempfile
 from dataclasses import dataclass
-from typing import Any, Callable, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, Sequence
 
 import numpy as np
 from codepy.jit import compile_from_string
 from codepy.toolchain import GCCToolchain, ToolchainGuessError, guess_toolchain
-from immutables import Map
 
 from pytools import memoize_method
 from pytools.codegen import CodeGenerator, Indentation
 from pytools.prefork import ExecError
 
-from loopy.codegen.result import GeneratedProgram
-from loopy.kernel import LoopKernel
 from loopy.kernel.array import ArrayBase
-from loopy.kernel.data import ArrayArg
-from loopy.schedule.tools import KernelArgInfo
 from loopy.target.execution import (
     ExecutionWrapperGeneratorBase,
     ExecutorBase,
     get_highlighted_code,
 )
-from loopy.translation_unit import TranslationUnit
 from loopy.types import LoopyType
-from loopy.typing import ExpressionT
+
+
+if TYPE_CHECKING:
+    from immutables import Map
+
+    from loopy.codegen.result import GeneratedProgram
+    from loopy.kernel import LoopKernel
+    from loopy.kernel.data import ArrayArg
+    from loopy.schedule.tools import KernelArgInfo
+    from loopy.translation_unit import TranslationUnit
+    from loopy.typing import Expression
 
 
 logger = logging.getLogger(__name__)
@@ -105,7 +112,7 @@ def handle_non_numpy_arg(self, gen, arg):
 
     def handle_alloc(
             self, gen: CodeGenerator, arg: ArrayArg,
-            strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str],
+            strify: Callable[[Expression | tuple[Expression]], str],
             skip_arg_checks: bool) -> None:
         """
         Handle allocation of non-specified arguments for C-execution
@@ -324,7 +331,7 @@ def build(self, name, code, debug=False, wait_on_error=None,
         c_fname = self._tempname("code." + self.source_suffix)
 
         # build object
-        _, mod_name, ext_file, recompiled = \
+        _, _mod_name, ext_file, recompiled = \
             compile_from_string(
                 self.toolchain.copy(
                     cflags=self.toolchain.cflags+list(extra_build_options)),
@@ -365,15 +372,15 @@ def __init__(self, toolchain=None,
 # {{{ placeholder till ctypes fixes: https://github.com/python/cpython/issues/61103
 
 class Complex64(ctypes.Structure):
-    _fields_ = [("real", ctypes.c_float), ("imag", ctypes.c_float)]
+    _fields_: ClassVar = [("real", ctypes.c_float), ("imag", ctypes.c_float)]
 
 
 class Complex128(ctypes.Structure):
-    _fields_ = [("real", ctypes.c_double), ("imag", ctypes.c_double)]
+    _fields_: ClassVar = [("real", ctypes.c_double), ("imag", ctypes.c_double)]
 
 
 class Complex256(ctypes.Structure):
-    _fields_ = [("real", ctypes.c_longdouble), ("imag", ctypes.c_longdouble)]
+    _fields_: ClassVar = [("real", ctypes.c_longdouble), ("imag", ctypes.c_longdouble)]
 
 
 _NUMPY_COMPLEX_TYPE_TO_CTYPE = {
@@ -425,7 +432,7 @@ class CompiledCKernel:
 
     def __init__(self, kernel: LoopKernel, devprog: GeneratedProgram,
             passed_names: Sequence[str], dev_code: str,
-            comp: Optional["CCompiler"] = None):
+            comp: CCompiler | None = None):
         # get code and build
         self.code = dev_code
         self.comp = comp if comp is not None else CCompiler()
@@ -473,7 +480,7 @@ class CExecutor(ExecutorBase):
     .. automethod:: __call__
     """
 
-    def __init__(self, program, entrypoint, compiler: Optional["CCompiler"] = None):
+    def __init__(self, program, entrypoint, compiler: CCompiler | None = None):
         """
         :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
             (a warning will be issued if more than one is returned). If the
@@ -493,7 +500,7 @@ def get_wrapper_generator(self):
 
     @memoize_method
     def translation_unit_info(self,
-            arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> _KernelInfo:
+            arg_to_dtype: Map[str, LoopyType] | None = None) -> _KernelInfo:
         t_unit = self.get_typed_and_scheduled_translation_unit(arg_to_dtype)
 
         from loopy.codegen import generate_code_v2
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 0c15faa58..b1723e9d6 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -21,7 +24,7 @@
 """
 
 
-from typing import Optional
+from typing import TYPE_CHECKING
 
 import numpy as np
 
@@ -44,11 +47,14 @@
 
 from loopy.diagnostic import LoopyError
 from loopy.expression import dtype_to_type_context
-from loopy.symbolic import TypeCast
 from loopy.target.c import CExpression
 from loopy.type_inference import TypeReader
 from loopy.types import LoopyType
-from loopy.typing import ExpressionT, is_integer
+from loopy.typing import Expression, is_integer
+
+
+if TYPE_CHECKING:
+    from loopy.symbolic import TypeCast
 
 
 __doc__ = """
@@ -92,7 +98,7 @@ def with_assignments(self, names_to_vars):
         type_inf_mapper = self.type_inf_mapper.with_assignments(names_to_vars)
         return type(self)(self.codegen_state, self.fortran_abi, type_inf_mapper)
 
-    def infer_type(self, expr: ExpressionT) -> LoopyType:
+    def infer_type(self, expr: Expression) -> LoopyType:
         result = self.type_inf_mapper(expr)
         assert isinstance(result, LoopyType)
 
@@ -123,7 +129,7 @@ def wrap_in_typecast(self, actual_type: LoopyType, needed_type: LoopyType, s):
 
         return s
 
-    def rec(self, expr, type_context=None, needed_type: Optional[LoopyType] = None):  # type: ignore[override]
+    def rec(self, expr, type_context=None, needed_type: LoopyType | None = None):  # type: ignore[override]
         result = super().rec(expr, type_context)
 
         if needed_type is None:
@@ -476,7 +482,7 @@ def map_constant(self, expr, type_context):
 
         elif np.isfinite(expr):
             if type_context == "f":
-                return Literal(repr(float((expr)))+"f")
+                return Literal(repr(float(expr))+"f")
             elif type_context == "d":
                 return Literal(repr(float(expr)))
             elif type_context in ["i", "b"]:
@@ -641,7 +647,7 @@ def map_constant(self, expr, prec):
                 # FIXME: Add type suffixes?
                 return repr(int(expr))
             elif isinstance(expr, np.float32):
-                return f"{repr(float(expr))}f"
+                return f"{float(expr)!r}f"
             elif isinstance(expr, np.float64):
                 return repr(float(expr))
             else:
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index afeb5cee2..50d2ac7fe 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -1,4 +1,5 @@
 """CUDA target independent of PyCUDA."""
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
@@ -23,16 +24,14 @@
 THE SOFTWARE.
 """
 
-from typing import Sequence, Tuple
+from typing import TYPE_CHECKING, Sequence
 
 import numpy as np
 
-from cgen import Const, Declarator, Generable
+from cgen import Const, Declarator, Generable, Pointer
 from pymbolic import var
 from pytools import memoize_method
 
-from loopy.codegen import CodeGenerationState
-from loopy.codegen.result import CodeGenerationResult
 from loopy.diagnostic import LoopyError, LoopyTypeError
 from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag
 from loopy.kernel.data import (
@@ -48,6 +47,11 @@
 from loopy.types import NumpyType
 
 
+if TYPE_CHECKING:
+    from loopy.codegen import CodeGenerationState
+    from loopy.codegen.result import CodeGenerationResult
+
+
 # {{{ vector types
 
 class vec:  # noqa
@@ -186,7 +190,7 @@ def cuda_with_types(self, arg_id_to_dtype, callables_table):
 
             input_dtype = arg_id_to_dtype[0]
 
-            scalar_dtype, offset, field_name = input_dtype.fields["x"]
+            scalar_dtype, _offset, _field_name = input_dtype.fields["x"]
             return_dtype = scalar_dtype
             return self.copy(arg_id_to_dtype={0: input_dtype, 1: input_dtype,
                                               -1: return_dtype})
@@ -332,7 +336,7 @@ def known_callables(self):
     def get_function_declaration(
             self, codegen_state: CodeGenerationState,
             codegen_result: CodeGenerationResult, schedule_index: int
-            ) -> Tuple[Sequence[Tuple[str, str]], Generable]:
+            ) -> tuple[Sequence[tuple[str, str]], Generable]:
         preambles, fdecl = super().get_function_declaration(
                 codegen_state, codegen_result, schedule_index)
 
@@ -369,8 +373,7 @@ def get_function_declaration(
     def preamble_generators(self):
 
         return (
-                super().preamble_generators() + [
-                    cuda_preamble_generator])
+                [*super().preamble_generators(), cuda_preamble_generator])
 
     # }}}
 
@@ -449,7 +452,7 @@ def get_array_base_declarator(self, ary: ArrayBase) -> Declarator:
     def get_array_arg_declarator(
             self, arg: ArrayArg, is_written: bool) -> Declarator:
         from cgen.cuda import CudaRestrictPointer
-        arg_decl = CudaRestrictPointer(
+        arg_decl: Declarator = CudaRestrictPointer(
                     self.get_array_base_declarator(arg))
 
         if not is_written:
@@ -478,11 +481,11 @@ def emit_temp_var_decl_for_tv_with_base_storage(self,
         assert tv.base_storage is not None
         ecm = codegen_state.expression_to_code_mapper
 
-        cast_decl = POD(self, tv.dtype, "")
-        temp_var_decl = POD(self, tv.dtype, tv.name)
+        cast_decl: Declarator = POD(self, tv.dtype, "")
+        temp_var_decl: Declarator = POD(self, tv.dtype, tv.name)
 
         if tv._base_storage_access_may_be_aliasing:
-            ptrtype = _ConstPointer
+            ptrtype: type[Pointer] = _ConstPointer
         else:
             # The 'restrict' part of this is a complete lie--of course
             # all these temporaries are aliased. But we're promising to
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index 2443a1420..cb737f95a 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012-17 Andreas Kloeckner, Nick Curtis"
 
 __license__ = """
@@ -25,17 +28,11 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import (
+    TYPE_CHECKING,
     Any,
     Callable,
-    Dict,
-    FrozenSet,
-    List,
     Mapping,
-    Optional,
     Sequence,
-    Set,
-    Tuple,
-    Union,
     cast,
 )
 
@@ -54,14 +51,17 @@
 
 from loopy.kernel import KernelState, LoopKernel
 from loopy.kernel.data import ArrayArg, _ArraySeparationInfo, auto
-from loopy.schedule.tools import KernelArgInfo
 from loopy.tools import LoopyKeyBuilder, caches
-from loopy.translation_unit import TranslationUnit
 from loopy.types import LoopyType, NumpyType
-from loopy.typing import ExpressionT, integer_expr_or_err
+from loopy.typing import Expression, integer_expr_or_err
 from loopy.version import DATA_MODEL_VERSION
 
 
+if TYPE_CHECKING:
+    from loopy.schedule.tools import KernelArgInfo
+    from loopy.translation_unit import TranslationUnit
+
+
 # {{{ object array argument packing
 
 class SeparateArrayPackingController:
@@ -74,10 +74,10 @@ class SeparateArrayPackingController:
     It also repacks outgoing arrays of this type back into an object array.
     """
 
-    def __init__(self, packing_info: Dict[str, _ArraySeparationInfo]) -> None:
+    def __init__(self, packing_info: dict[str, _ArraySeparationInfo]) -> None:
         # These must work to index tuples if 1D.
         def untuple_length_1_indices(
-                ind: Tuple[int, ...]) -> Union[int, Tuple[int, ...]]:
+                ind: tuple[int, ...]) -> int | tuple[int, ...]:
             if len(ind) == 1:
                 return ind[0]
             else:
@@ -91,7 +91,7 @@ def untuple_length_1_indices(
                 for name, sep_info in packing_info.items()
                 }
 
-    def __call__(self, kernel_kwargs: Dict[str, Any]) -> Dict[str, Any]:
+    def __call__(self, kernel_kwargs: dict[str, Any]) -> dict[str, Any]:
         kernel_kwargs = kernel_kwargs.copy()
 
         for name, ind_to_subary_name in self.packing_info.items():
@@ -109,7 +109,7 @@ def __call__(self, kernel_kwargs: Dict[str, Any]) -> Dict[str, Any]:
 
 # {{{ ExecutionWrapperGeneratorBase
 
-def _str_to_expr(name_or_expr: Union[str, ExpressionT]) -> ExpressionT:
+def _str_to_expr(name_or_expr: str | Expression) -> Expression:
     if isinstance(name_or_expr, str):
         return var(name_or_expr)
     else:
@@ -118,14 +118,14 @@ def _str_to_expr(name_or_expr: Union[str, ExpressionT]) -> ExpressionT:
 
 @dataclass(frozen=True)
 class _ArgFindingEquation:
-    lhs: ExpressionT
-    rhs: ExpressionT
+    lhs: Expression
+    rhs: Expression
 
     # Arg finding code is sorted by priority, all equations (across all unknowns)
     # of lowest priority first.
     order: int
 
-    based_on_names: FrozenSet[str]
+    based_on_names: frozenset[str]
 
 
 class ExecutionWrapperGeneratorBase(ABC):
@@ -172,11 +172,11 @@ def generate_integer_arg_finding_from_array_data(
         from loopy.kernel.array import get_strides
         from loopy.kernel.data import ArrayArg
         from loopy.symbolic import DependencyMapper, StringifyMapper
-        dep_map = DependencyMapper()
+        dep_map: DependencyMapper[[]] = DependencyMapper()
 
         # {{{ find equations
 
-        equations: List[_ArgFindingEquation] = []
+        equations: list[_ArgFindingEquation] = []
 
         for arg_name in kai.passed_arg_names:
             arg = kernel.arg_dict[arg_name]
@@ -252,7 +252,7 @@ def generate_integer_arg_finding_from_array_data(
         # {{{ regroup equations by unknown
 
         order_to_unknown_to_equations: \
-                Dict[int, Dict[str, List[_ArgFindingEquation]]] = {}
+                dict[int, dict[str, list[_ArgFindingEquation]]] = {}
 
         for eqn in equations:
             deps = dep_map(eqn.rhs)
@@ -261,8 +261,8 @@ def generate_integer_arg_finding_from_array_data(
                 unknown_var, = deps
                 order_to_unknown_to_equations \
                         .setdefault(eqn.order, {}) \
-                        .setdefault(cast(Variable, unknown_var).name, []) \
-                        .append((eqn))
+                        .setdefault(cast("Variable", unknown_var).name, []) \
+                        .append(eqn)
             else:
                 # Zero deps: nothing to determine, forget about it.
                 # 2+ deps: not implemented
@@ -287,7 +287,7 @@ def generate_integer_arg_finding_from_array_data(
                         key=lambda eqn: eqn.order)
                 subgen = CodeGenerator()
 
-                seen_based_on_names: Set[FrozenSet[str]] = set()
+                seen_based_on_names: set[frozenset[str]] = set()
 
                 if_or_elif = "if"
 
@@ -389,7 +389,7 @@ def handle_non_numpy_arg(self, gen: CodeGenerator, arg):
 
     def handle_alloc(
             self, gen: CodeGenerator, arg: ArrayArg,
-            strify: Callable[[Union[ExpressionT, Tuple[ExpressionT]]], str],
+            strify: Callable[[Expression | tuple[Expression]], str],
             skip_arg_checks: bool) -> None:
         """
         Handle allocation of non-specified arguments for C-execution
@@ -534,7 +534,7 @@ def strify_allowing_none(shape_axis):
                         else:
                             return strify(shape_axis)
 
-                    def strify_tuple(t: Optional[Tuple[ExpressionT, ...]]) -> str:
+                    def strify_tuple(t: tuple[Expression, ...] | None) -> str:
                         if t is None:
                             return "None"
                         if len(t) == 0:
@@ -735,7 +735,7 @@ def __call__(self, program, entrypoint, codegen_result):
 
 
 typed_and_scheduled_cache: WriteOncePersistentDict[
-    Tuple[str, TranslationUnit, Optional[Mapping[str, LoopyType]]],
+    tuple[str, TranslationUnit, Mapping[str, LoopyType] | None],
     TranslationUnit
 ] = WriteOncePersistentDict(
         "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION,
@@ -747,7 +747,7 @@ def __call__(self, program, entrypoint, codegen_result):
 
 
 invoker_cache: WriteOncePersistentDict[
-    Tuple[str, TranslationUnit, str],
+    tuple[str, TranslationUnit, str],
     str
 ] = WriteOncePersistentDict(
         "loopy-invoker-cache-v10-"+DATA_MODEL_VERSION,
@@ -767,7 +767,7 @@ class ExecutorBase:
 
     .. automethod:: __call__
     """
-    packing_controller: Optional[SeparateArrayPackingController]
+    packing_controller: SeparateArrayPackingController | None
 
     def __init__(self, t_unit: TranslationUnit, entrypoint: str):
         self.t_unit = t_unit
@@ -817,7 +817,7 @@ def check_for_required_array_arguments(self, input_args):
                 "your argument.")
 
     def get_typed_and_scheduled_translation_unit_uncached(
-            self, arg_to_dtype: Optional[Map[str, LoopyType]]
+            self, arg_to_dtype: Map[str, LoopyType] | None
             ) -> TranslationUnit:
         t_unit = self.t_unit
 
@@ -854,7 +854,7 @@ def get_typed_and_scheduled_translation_unit_uncached(
         return t_unit
 
     def get_typed_and_scheduled_translation_unit(
-            self, arg_to_dtype: Optional[Map[str, LoopyType]]
+            self, arg_to_dtype: Map[str, LoopyType] | None
             ) -> TranslationUnit:
         from loopy import CACHING_ENABLED
 
@@ -876,7 +876,7 @@ def get_typed_and_scheduled_translation_unit(
 
         return t_unit
 
-    def arg_to_dtype(self, kwargs) -> Optional[Map[str, LoopyType]]:
+    def arg_to_dtype(self, kwargs) -> Map[str, LoopyType] | None:
         if not self.has_runtime_typed_args:
             return None
 
@@ -904,7 +904,7 @@ def get_highlighted_code(self, entrypoint, arg_to_dtype=None, code=None):
 
     def get_code(
             self, entrypoint: str,
-            arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> str:
+            arg_to_dtype: Map[str, LoopyType] | None = None) -> str:
         kernel = self.get_typed_and_scheduled_translation_unit(arg_to_dtype)
 
         from loopy.codegen import generate_code_v2
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 1cd7a5bd2..e493ee3e9 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -1,4 +1,5 @@
 """Target for Intel ISPC."""
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
@@ -24,9 +25,9 @@
 """
 
 
-from typing import Sequence, Tuple, cast
+from typing import TYPE_CHECKING, Sequence, cast
 
-import numpy as np  # noqa
+import numpy as np
 
 import pymbolic.primitives as p
 from cgen import Collection, Const, Declarator, Generable
@@ -34,16 +35,19 @@
 from pymbolic.mapper.stringifier import PREC_NONE
 from pytools import memoize_method
 
-from loopy.codegen import CodeGenerationState
-from loopy.codegen.result import CodeGenerationResult
 from loopy.diagnostic import LoopyError
 from loopy.kernel.data import AddressSpace, ArrayArg, TemporaryVariable
-from loopy.schedule import CallKernel
 from loopy.symbolic import Literal
 from loopy.target.c import CFamilyASTBuilder, CFamilyTarget
 from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
-from loopy.types import LoopyType
-from loopy.typing import ExpressionT
+
+
+if TYPE_CHECKING:
+    from loopy.codegen import CodeGenerationState
+    from loopy.codegen.result import CodeGenerationResult
+    from loopy.schedule import CallKernel
+    from loopy.types import LoopyType
+    from loopy.typing import Expression
 
 
 # {{{ expression mapper
@@ -114,7 +118,7 @@ def map_subscript(self, expr, type_context):
                 and ary.address_space == AddressSpace.PRIVATE):
             # generate access code for access to private-index temporaries
 
-            gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
+            _gsize, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
             if lsize:
                 lsize, = lsize
                 from pymbolic import evaluate
@@ -174,7 +178,7 @@ class ISPCTarget(CFamilyTarget):
     device_program_name_suffix = "_inner"
 
     def pre_codegen_entrypoint_check(self, kernel, callables_table):
-        gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs(
+        _gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs(
                 callables_table)
         if len(lsize) > 1:
             for ls_i in lsize[1:]:
@@ -208,13 +212,13 @@ class ISPCASTBuilder(CFamilyASTBuilder):
     def get_function_declaration(
             self, codegen_state: CodeGenerationState,
             codegen_result: CodeGenerationResult, schedule_index: int
-            ) -> Tuple[Sequence[Tuple[str, str]], Generable]:
+            ) -> tuple[Sequence[tuple[str, str]], Generable]:
         name = codegen_result.current_program(codegen_state).name
         kernel = codegen_state.kernel
 
         assert codegen_state.kernel.linearization is not None
         subkernel_name = cast(
-                        CallKernel,
+                        "CallKernel",
                         codegen_state.kernel.linearization[schedule_index]
                         ).kernel_name
 
@@ -237,7 +241,7 @@ def get_function_declaration(
                         for arg_name in passed_names]
 
         if codegen_state.is_generating_device_code:
-            result = ISPCTask(
+            result: Declarator = ISPCTask(
                         FunctionDeclaration(
                             Value("void", name),
                             arg_decls))
@@ -252,8 +256,8 @@ def get_function_declaration(
 
     def get_kernel_call(self, codegen_state: CodeGenerationState,
             subkernel_name: str,
-            gsize: Tuple[ExpressionT, ...],
-            lsize: Tuple[ExpressionT, ...]) -> Generable:
+            gsize: tuple[Expression, ...],
+            lsize: tuple[Expression, ...]) -> Generable:
         kernel = codegen_state.kernel
         ecm = self.get_expression_to_code_mapper(codegen_state)
 
@@ -323,7 +327,7 @@ def get_array_arg_declarator(
             self, arg: ArrayArg, is_written: bool) -> Declarator:
         # FIXME restrict?
         from cgen.ispc import ISPCUniform, ISPCUniformPointer
-        decl = ISPCUniform(
+        decl: Declarator = ISPCUniform(
                 ISPCUniformPointer(self.get_array_base_declarator(arg)))
 
         if not is_written:
@@ -499,7 +503,7 @@ def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
                 inner)
 
         if hints:
-            return Collection(list(hints) + [loop])
+            return Collection([*hints, loop])
         else:
             return loop
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 14383e54f..d14dd9e30 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -1,4 +1,5 @@
 """OpenCL target independent of PyOpenCL."""
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
@@ -23,16 +24,13 @@
 THE SOFTWARE.
 """
 
-from typing import Sequence, Tuple
+from typing import TYPE_CHECKING, Literal, Sequence
 
 import numpy as np
 
-from cgen import Declarator, Generable
 from pymbolic import var
 from pytools import memoize_method
 
-from loopy.codegen import CodeGenerationState
-from loopy.codegen.result import CodeGenerationResult
 from loopy.diagnostic import LoopyError, LoopyTypeError
 from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag, VectorArrayDimTag
 from loopy.kernel.data import AddressSpace, ConstantArg, ImageArg
@@ -42,6 +40,13 @@
 from loopy.types import NumpyType
 
 
+if TYPE_CHECKING:
+    from cgen import Declarator, Generable
+
+    from loopy.codegen import CodeGenerationState
+    from loopy.codegen.result import CodeGenerationResult
+
+
 # {{{ dtype registry wrappers
 
 
@@ -321,7 +326,7 @@ def with_types(self, arg_id_to_dtype, callables_table):
                         callables_table)
 
             dtype = arg_id_to_dtype[0]
-            scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"]
+            scalar_dtype, _offset, _field_name = dtype.numpy_dtype.fields["s0"]
             return (
                     self.copy(name_in_target=name, arg_id_to_dtype={-1:
                         NumpyType(scalar_dtype), 0: dtype, 1: dtype}),
@@ -618,15 +623,12 @@ def known_callables(self):
 
     def symbol_manglers(self):
         return (
-                super().symbol_manglers() + [
-                    opencl_symbol_mangler
-                    ])
+                [*super().symbol_manglers(), opencl_symbol_mangler])
 
     def preamble_generators(self):
 
         return (
-                super().preamble_generators() + [
-                    opencl_preamble_generator])
+                [*super().preamble_generators(), opencl_preamble_generator])
 
     # }}}
 
@@ -635,7 +637,7 @@ def preamble_generators(self):
     def get_function_declaration(
             self, codegen_state: CodeGenerationState,
             codegen_result: CodeGenerationResult, schedule_index: int
-            ) -> Tuple[Sequence[Tuple[str, str]], Generable]:
+            ) -> tuple[Sequence[tuple[str, str]], Generable]:
         preambles, fdecl = super().get_function_declaration(
                 codegen_state, codegen_result, schedule_index)
 
@@ -764,12 +766,9 @@ def get_constant_arg_declarator(self, arg: ConstantArg) -> Declarator:
 
     def get_image_arg_declarator(
             self, arg: ImageArg, is_written: bool) -> Declarator:
-        if is_written:
-            mode = "w"
-        else:
-            mode = "r"
-
         from cgen.opencl import CLImage
+
+        mode: Literal["r", "w"] = "w" if is_written else "r"
         return CLImage(arg.num_target_axes(), mode, arg.name)
 
     # }}}
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index e4da6cd8b..9add453d7 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -26,17 +26,17 @@
 """
 
 import logging
-from typing import TYPE_CHECKING, Any, List, Optional, Sequence, Tuple, Union, cast
+from typing import TYPE_CHECKING, Any, Sequence, cast
 from warnings import warn
 
 import numpy as np
 
-import genpy
 import pymbolic.primitives as p
 from cgen import (
     Block,
     Collection,
     Const,
+    Declarator,
     FunctionBody,
     Generable,
     Initializer,
@@ -45,10 +45,7 @@
 )
 from cgen.opencl import CLGlobal
 
-from loopy.codegen import CodeGenerationState
-from loopy.codegen.result import CodeGenerationResult
 from loopy.diagnostic import LoopyError, LoopyTypeError
-from loopy.kernel import LoopKernel
 from loopy.kernel.data import (
     ArrayArg,
     ConstantArg,
@@ -63,18 +60,23 @@
     OpenCLCASTBuilder,
     OpenCLTarget,
 )
-from loopy.target.pyopencl_execution import PyOpenCLExecutor
 from loopy.target.python import PythonASTBuilderBase
-from loopy.translation_unit import FunctionIdT, TranslationUnit
 from loopy.types import NumpyType
-from loopy.typing import ExpressionT
 
 
 logger = logging.getLogger(__name__)
 
 if TYPE_CHECKING:
+    import genpy
     import pyopencl as cl
 
+    from loopy.codegen import CodeGenerationState
+    from loopy.codegen.result import CodeGenerationResult
+    from loopy.kernel import LoopKernel
+    from loopy.target.pyopencl_execution import PyOpenCLExecutor
+    from loopy.translation_unit import FunctionIdT, TranslationUnit
+    from loopy.typing import Expression
+
 
 # {{{ pyopencl function scopers
 
@@ -506,23 +508,23 @@ class PyOpenCLTarget(OpenCLTarget):
     """
 
     # FIXME make prefixes conform to naming rules
-    # (see Reference: Loopy’s Model of a Kernel)
+    # (see Reference: Loopy's Model of a Kernel)
 
     host_program_name_prefix = "_lpy_host_"
     host_program_name_suffix = ""
 
     # FIXME Not yet complete
-    limit_arg_size_nbytes: Optional[int]
+    limit_arg_size_nbytes: int | None
     pointer_size_nbytes: int
 
     def __init__(
             self, device=None, *, pyopencl_module_name: str = "_lpy_cl",
             atomics_flavor=None, use_int8_for_bool: bool = True,
-            limit_arg_size_nbytes: Optional[int] = None,
-            pointer_size_nbytes: Optional[int] = None
+            limit_arg_size_nbytes: int | None = None,
+            pointer_size_nbytes: int | None = None
             ) -> None:
         # This ensures the dtype registry is populated.
-        import pyopencl.tools  # noqa
+        import pyopencl.tools
 
         super().__init__(
             atomics_flavor=atomics_flavor,
@@ -553,10 +555,8 @@ def device(self):
         return None
 
     # NB: Not including 'device', as that is handled specially here.
-    hash_fields = OpenCLTarget.hash_fields + (
-            "pyopencl_module_name",)
-    comparison_fields = OpenCLTarget.comparison_fields + (
-            "pyopencl_module_name",)
+    hash_fields = (*OpenCLTarget.hash_fields, "pyopencl_module_name")
+    comparison_fields = (*OpenCLTarget.comparison_fields, "pyopencl_module_name")
 
     def get_host_ast_builder(self):
         return PyOpenCLPythonASTBuilder(self)
@@ -621,7 +621,7 @@ def get_kernel_executor_cache_key(self, queue, **kwargs):
     # type-ignore because we're making things from *args: Any more concrete,
     # and mypy doesn't like it.
     def get_kernel_executor(self, t_unit: TranslationUnit,  # type: ignore[override]
-                            queue_or_context: Union[cl.CommandQueue, cl.Context],
+                            queue_or_context: cl.CommandQueue | cl.Context,
                             *args: Any, entrypoint: FunctionIdT, **kwargs: Any
                             ) -> PyOpenCLExecutor:
         from pyopencl import CommandQueue
@@ -648,7 +648,7 @@ def generate_value_arg_setup(
     import loopy as lp
     from loopy.kernel.array import ArrayBase
 
-    result: List[genpy.Generable] = []
+    result: list[genpy.Generable] = []
     gen = result.append
 
     buf_indices_and_args = []
@@ -733,10 +733,10 @@ def generate_array_arg_setup(
 
     from loopy.kernel.array import ArrayBase
 
-    result: List[genpy.Generable] = []
+    result: list[genpy.Generable] = []
     gen = result.append
 
-    cl_indices_and_args: List[Union[int, str]] = []
+    cl_indices_and_args: list[int | str] = []
     for arg_idx, passed_name in enumerate(passed_names):
         if passed_name in kernel.all_inames():
             continue
@@ -774,9 +774,8 @@ def get_function_definition(
         kai = get_kernel_arg_info(codegen_state.kernel)
 
         args = (
-                ["_lpy_cl_kernels", "queue"]
-                + list(kai.passed_arg_names)
-                + ["wait_for=None", "allocator=None"])
+                ["_lpy_cl_kernels", "queue", *kai.passed_arg_names,
+                    "wait_for=None", "allocator=None"])
 
         from genpy import For, Function, Line, Return, Statement as S, Suite
         return Function(
@@ -803,7 +802,7 @@ def get_function_definition(
     def get_function_declaration(
             self, codegen_state: CodeGenerationState,
             codegen_result: CodeGenerationResult, schedule_index: int
-            ) -> Tuple[Sequence[Tuple[str, str]], Optional[genpy.Generable]]:
+            ) -> tuple[Sequence[tuple[str, str]], genpy.Generable | None]:
         # no such thing in Python
         return [], None
 
@@ -855,7 +854,7 @@ def get_temporary_decls(self, codegen_state, schedule_index):
     def get_kernel_call(
             self, codegen_state: CodeGenerationState,
             subkernel_name: str,
-            gsize: Tuple[ExpressionT, ...], lsize: Tuple[ExpressionT, ...]
+            gsize: tuple[Expression, ...], lsize: tuple[Expression, ...]
             ) -> genpy.Suite:
         from genpy import Assert, Assign, Comment, Line, Suite
 
@@ -920,7 +919,7 @@ def get_kernel_call(
                     "_lpy_cl.mem_flags.READ_ONLY "
                     "| _lpy_cl.mem_flags.COPY_HOST_PTR, "
                     "hostbuf="
-                    f"_lpy_pack({repr(''.join(struct_pack_types))}, "
+                    f"_lpy_pack({''.join(struct_pack_types)!r}, "
                     f"{', '.join(struct_pack_args)}))"),
                 Line(f"_lpy_knl.set_arg({cl_arg_count}, _lpy_overflow_args_buf)")
                 ])
@@ -980,8 +979,8 @@ def get_kernel_call(
 
 def split_args_for_overflow(
         kernel: LoopKernel, passed_names: Sequence[str],
-        *, limit_arg_size_nbytes: Optional[int], pointer_size_nbytes: int
-        ) -> Tuple[Sequence[str], Sequence[str]]:
+        *, limit_arg_size_nbytes: int | None, pointer_size_nbytes: int
+        ) -> tuple[Sequence[str], Sequence[str]]:
     if limit_arg_size_nbytes is None:
         return passed_names, []
 
@@ -1027,15 +1026,18 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder):
     # {{{ function decl/def, with arg overflow handling
 
     def get_function_definition(
-            self, codegen_state: CodeGenerationState,
+            self,
+            codegen_state: CodeGenerationState,
             codegen_result: CodeGenerationResult,
-            schedule_index: int, function_decl: Generable, function_body: Generable,
-            ) -> Tuple[Sequence[Tuple[str, str]], Generable]:
+            schedule_index: int,
+            function_decl: Generable,
+            function_body: Generable,
+            ) -> Generable:
         assert isinstance(function_body, Block)
         kernel = codegen_state.kernel
         assert kernel.linearization is not None
 
-        subkernel_name = cast(CallKernel,
+        subkernel_name = cast("CallKernel",
                 kernel.linearization[schedule_index]).kernel_name
 
         result = []
@@ -1063,10 +1065,12 @@ def get_function_definition(
 
                     if tv.initializer is not None:
                         from loopy.target.c import generate_array_literal
-                        decl = Initializer(decl, generate_array_literal(
+                        init_decl = Initializer(decl, generate_array_literal(
                             codegen_state, tv, tv.initializer))
+                    else:
+                        init_decl = decl
 
-                    result.append(decl)
+                    result.append(init_decl)
 
         # {{{ unpack overflow args
 
@@ -1092,34 +1096,40 @@ def get_function_definition(
 
         # }}}
 
+        from loopy.target.c import FunctionDeclarationWrapper
+
+        assert isinstance(function_decl, FunctionDeclarationWrapper)
+        if not isinstance(function_body, Block):
+            function_body = Block([function_body])
+
         fbody = FunctionBody(function_decl, function_body)
         if not result:
             return fbody
         else:
-            return Collection(result+[Line(), fbody])
+            return Collection([*result, Line(), fbody])
 
     def get_function_declaration(
             self, codegen_state: CodeGenerationState,
             codegen_result: CodeGenerationResult, schedule_index: int
-            ) -> Tuple[Sequence[Tuple[str, str]], Generable]:
+            ) -> tuple[Sequence[tuple[str, str]], Generable]:
         kernel = codegen_state.kernel
 
         assert codegen_state.kernel.linearization is not None
         subkernel_name = cast(
-                        CallKernel,
+                        "CallKernel",
                         codegen_state.kernel.linearization[schedule_index]
                         ).kernel_name
 
         from cgen import FunctionDeclaration, Struct, Value
 
-        name = codegen_result.current_program(codegen_state).name
+        name_str = codegen_result.current_program(codegen_state).name
         if self.target.fortran_abi:
-            name += "_"
+            name_str += "_"
 
         from loopy.target.c import FunctionDeclarationWrapper
 
         if codegen_state.is_entrypoint:
-            name = Value("void", name)
+            name = Value("void", name_str)
 
             # subkernel launches occur only as part of entrypoint kernels for now
             from loopy.schedule.tools import get_subkernel_arg_info
@@ -1149,7 +1159,7 @@ def get_function_declaration(
                         (f"declare-{arg_overflow_struct_name}",
                             str(arg_overflow_struct))
                         ] if struct_overflow_arg_names else []
-                arg_struct_args = [CLGlobal(Const(Pointer(Value(
+                arg_struct_args: list[Declarator] = [CLGlobal(Const(Pointer(Value(
                                 f"struct {arg_overflow_struct_name}",
                                 "_lpy_overflow_args"))))]
             else:
@@ -1168,7 +1178,7 @@ def get_function_declaration(
                             + arg_struct_args
                             )))
         else:
-            name = Value("static void", name)
+            name = Value("static void", name_str)
             passed_names = [arg.name for arg in kernel.args]
             written_names = kernel.get_written_variables()
 
@@ -1195,9 +1205,7 @@ def known_callables(self):
         return callables
 
     def preamble_generators(self):
-        return ([
-            pyopencl_preamble_generator,
-            ] + super().preamble_generators())
+        return ([pyopencl_preamble_generator, *super().preamble_generators()])
 
     # }}}
 
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index be859ab70..c9191e1d1 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -23,29 +26,31 @@
 
 import logging
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Callable, Optional, Sequence
+from typing import TYPE_CHECKING, Any, Callable, Sequence
 
 import numpy as np
-from immutables import Map
 
 from pytools import memoize_method
 from pytools.codegen import CodeGenerator, Indentation
 
-from loopy.codegen.result import CodeGenerationResult
-from loopy.kernel import LoopKernel
 from loopy.kernel.data import ArrayArg
-from loopy.schedule.tools import KernelArgInfo
 from loopy.target.execution import ExecutionWrapperGeneratorBase, ExecutorBase
-from loopy.types import LoopyType
-from loopy.typing import ExpressionT, integer_expr_or_err
+from loopy.typing import Expression, integer_expr_or_err
 
 
 logger = logging.getLogger(__name__)
 
 
 if TYPE_CHECKING:
+    from immutables import Map
+
     import pyopencl as cl
 
+    from loopy.codegen.result import CodeGenerationResult
+    from loopy.kernel import LoopKernel
+    from loopy.schedule.tools import KernelArgInfo
+    from loopy.types import LoopyType
+
 
 # {{{ invoker generation
 
@@ -109,7 +114,7 @@ def handle_non_numpy_arg(self, gen: CodeGenerator, arg: ArrayArg) -> None:
 
     def handle_alloc(
             self, gen: CodeGenerator, arg: ArrayArg,
-            strify: Callable[[ExpressionT], str],
+            strify: Callable[[Expression], str],
             skip_arg_checks: bool) -> None:
         """
         Handle allocation of non-specified arguments for pyopencl execution
@@ -201,9 +206,8 @@ def generate_invocation(self, gen: CodeGenerator, kernel: LoopKernel,
 
             gen("")
 
-        arg_list = (["_lpy_cl_kernels", "queue"]
-                + list(args)
-                + ["wait_for=wait_for", "allocator=allocator"])
+        arg_list = (["_lpy_cl_kernels", "queue", *args,
+            "wait_for=wait_for", "allocator=allocator"])
         gen(f"_lpy_evt = {host_program_name}({', '.join(arg_list)})")
 
         if kernel.options.cl_exec_manage_array_events:
@@ -274,7 +278,7 @@ def get_arg_pass(self, arg):
 
 @dataclass(frozen=True)
 class _KernelInfo:
-    cl_kernels: "_Kernels"
+    cl_kernels: _Kernels
     invoker: Callable[..., Any]
 
 
@@ -292,7 +296,7 @@ class PyOpenCLExecutor(ExecutorBase):
     .. automethod:: __call__
     """
 
-    def __init__(self, context: "cl.Context", t_unit, entrypoint):
+    def __init__(self, context: cl.Context, t_unit, entrypoint):
         super().__init__(t_unit, entrypoint)
 
         self.context = context
@@ -307,7 +311,7 @@ def get_wrapper_generator(self):
     @memoize_method
     def translation_unit_info(
             self,
-            arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> _KernelInfo:
+            arg_to_dtype: Map[str, LoopyType] | None = None) -> _KernelInfo:
         t_unit = self.get_typed_and_scheduled_translation_unit(arg_to_dtype)
 
         # FIXME: now just need to add the types to the arguments
diff --git a/loopy/target/python.py b/loopy/target/python.py
index 3a8747f38..1b2560402 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -1,4 +1,5 @@
 """Python host AST builder for integration with PyOpenCL."""
+from __future__ import annotations
 
 
 __copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
@@ -23,7 +24,7 @@
 THE SOFTWARE.
 """
 
-from typing import Optional, Sequence, Tuple
+from typing import TYPE_CHECKING, Sequence
 
 import numpy as np
 
@@ -31,14 +32,17 @@
 from pymbolic.mapper import Mapper
 from pymbolic.mapper.stringifier import StringifyMapper
 
-from loopy.codegen import CodeGenerationState
-from loopy.codegen.result import CodeGenerationResult
-from loopy.diagnostic import LoopyError  # noqa
+from loopy.diagnostic import LoopyError
 from loopy.kernel.data import ValueArg
 from loopy.target import ASTBuilderBase
 from loopy.type_inference import TypeReader
 
 
+if TYPE_CHECKING:
+    from loopy.codegen import CodeGenerationState
+    from loopy.codegen.result import CodeGenerationResult
+
+
 # {{{ expression to code
 
 class ExpressionToPythonMapper(StringifyMapper):
@@ -161,9 +165,7 @@ def known_callables(self):
 
     def preamble_generators(self):
         return (
-                super().preamble_generators() + [
-                    _base_python_preamble_generator
-                    ])
+                [*super().preamble_generators(), _base_python_preamble_generator])
 
     # {{{ code generation guts
 
@@ -175,7 +177,7 @@ def ast_module(self):
     def get_function_declaration(
             self, codegen_state: CodeGenerationState,
             codegen_result: CodeGenerationResult, schedule_index: int
-            ) -> Tuple[Sequence[Tuple[str, str]], Optional[Generable]]:
+            ) -> tuple[Sequence[tuple[str, str]], Generable | None]:
         return [], None
 
     def get_function_definition(self, codegen_state, codegen_result,
diff --git a/loopy/tools.py b/loopy/tools.py
index bb4904bf2..2f18cfb91 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -24,7 +27,6 @@
 import logging
 from functools import cached_property
 from sys import intern
-from typing import List
 
 import numpy as np
 from immutables import Map
@@ -136,8 +138,8 @@ def hash_key(self):
         kb = LoopyKeyBuilder()
         # Build the key. For faster hashing, avoid hashing field names.
         key = (
-            (self.class_.__name__.encode("utf-8"),) +
-            tuple(self.field_dict[k] for k in sorted(self.field_dict.keys())))
+            (self.class_.__name__.encode("utf-8"),
+                *(self.field_dict[k] for k in sorted(self.field_dict.keys()))))
 
         return kb(key)
 
@@ -242,25 +244,14 @@ def build_ispc_shared_lib(
 
     from subprocess import check_call
 
-    ispc_cmd = ([ispc_bin,
-                "--pic",
-                "-o", "ispc.o"]
-            + ispc_options
-            + list(ispc_source_names))
+    ispc_cmd = ([ispc_bin, "--pic", "-o", "ispc.o", *ispc_options, *ispc_source_names])
     if not quiet:
         print(" ".join(ispc_cmd))
 
     check_call(ispc_cmd, cwd=cwd)
 
-    cxx_cmd = ([
-                cxx_bin,
-                "-shared", "-Wl,--export-dynamic",
-                "-fPIC",
-                "-oshared.so",
-                "ispc.o",
-                ]
-            + cxx_options
-            + list(cxx_source_names))
+    cxx_cmd = ([cxx_bin, "-shared", "-Wl,--export-dynamic", "-fPIC", "-oshared.so",
+        "ispc.o", *cxx_options, *cxx_source_names])
 
     check_call(cxx_cmd, cwd=cwd)
 
@@ -279,7 +270,7 @@ def address_from_numpy(obj):
     if ary_intf is None:
         raise RuntimeError("no array interface")
 
-    buf_base, is_read_only = ary_intf["data"]
+    buf_base, _is_read_only = ary_intf["data"]
     return buf_base + ary_intf.get("offset", 0)
 
 
@@ -316,10 +307,10 @@ def empty_aligned(shape, dtype, order="C", n=64):
 
     # We now need to know how to offset base_ary
     # so it is correctly aligned
-    _array_aligned_offset = (n-address_from_numpy(base_ary)) % n
+    array_aligned_offset = (n-address_from_numpy(base_ary)) % n
 
     array = np.frombuffer(
-            base_ary[_array_aligned_offset:_array_aligned_offset-n].data,
+            base_ary[array_aligned_offset:array_aligned_offset-n].data,
             dtype=dtype).reshape(shape, order=order)
 
     return array
@@ -535,7 +526,7 @@ class Optional:
         The value, if present.
     """
 
-    __slots__ = ("has_value", "_value")
+    __slots__ = ("_value", "has_value")
 
     def __init__(self, value=_no_value):
         self.has_value = value is not _no_value
@@ -828,7 +819,7 @@ def t_unit_to_python(t_unit, var_name="t_unit",
         "from pymbolic.primitives import *",
         "import immutables",
         ])
-    body_str = "\n".join(knl_python_code_srcs + ["\n", merge_stmt])
+    body_str = "\n".join([*knl_python_code_srcs, "\n", merge_stmt])
 
     python_code = "\n".join([preamble_str, "\n", body_str])
     assert _is_generated_t_unit_the_same(python_code, var_name, t_unit)
@@ -843,7 +834,7 @@ def t_unit_to_python(t_unit, var_name="t_unit",
 
 # {{{ cache management
 
-caches: List[WriteOncePersistentDict] = []
+caches: list[WriteOncePersistentDict] = []
 
 
 def clear_in_mem_caches() -> None:
diff --git a/loopy/transform/__init__.py b/loopy/transform/__init__.py
index 625781167..9a205fe23 100644
--- a/loopy/transform/__init__.py
+++ b/loopy/transform/__init__.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py
index 7ab5e376e..16ebaa5c0 100644
--- a/loopy/transform/add_barrier.py
+++ b/loopy/transform/add_barrier.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2017 Kaushik Kulkarni"
 
 __license__ = """
@@ -89,7 +92,7 @@ def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None,
                                         synchronization_kind=synchronization_kind,
                                         mem_kind=mem_kind)
 
-    new_kernel = kernel.copy(instructions=kernel.instructions + [barrier_to_add])
+    new_kernel = kernel.copy(instructions=[*kernel.instructions, barrier_to_add])
     if insn_after is not None:
         new_kernel = add_dependency(new_kernel,
                                  insn_match=insn_after,
diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py
index b527c087b..8ad7d658b 100644
--- a/loopy/transform/arithmetic.py
+++ b/loopy/transform/arithmetic.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py
index 81b5c933f..fb54dedd4 100644
--- a/loopy/transform/array_buffer_map.py
+++ b/loopy/transform/array_buffer_map.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner"
 
 __license__ = """
@@ -23,18 +26,21 @@
 
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, replace
-from typing import Any, Callable, Optional, Sequence, Tuple
+from typing import TYPE_CHECKING, Any, Callable, Sequence
 
 from typing_extensions import Self
 
 import islpy as isl
 from islpy import dim_type
-from pymbolic import ArithmeticExpressionT, var
+from pymbolic import ArithmeticExpression, var
 from pymbolic.mapper.substitutor import make_subst_func
 from pytools import memoize_method
 
 from loopy.symbolic import SubstitutionMapper, get_dependencies
-from loopy.typing import ExpressionT
+
+
+if TYPE_CHECKING:
+    from loopy.typing import Expression
 
 
 @dataclass(frozen=True)
@@ -47,7 +53,7 @@ class AccessDescriptor:
     """
 
     identifier: Any = None
-    storage_axis_exprs: Optional[Sequence[ArithmeticExpressionT]] = None
+    storage_axis_exprs: Sequence[ArithmeticExpression] | None = None
 
     def copy(self, **kwargs) -> Self:
         return replace(self, **kwargs)
@@ -72,10 +78,10 @@ def to_parameters_or_project_out(param_inames, set_inames, set):
 # {{{ construct storage->sweep map
 
 def build_per_access_storage_to_domain_map(
-        storage_axis_exprs: Sequence[ExpressionT],
+        storage_axis_exprs: Sequence[Expression],
         domain: isl.BasicSet,
         storage_axis_names: Sequence[str],
-        prime_sweep_inames: Callable[[ExpressionT], ExpressionT]
+        prime_sweep_inames: Callable[[Expression], Expression]
         ) -> isl.BasicMap:
 
     map_space = domain.space
@@ -203,10 +209,10 @@ def compute_bounds(kernel, domain, stor2sweep,
 # {{{ array-to-buffer map
 
 class ArrayToBufferMapBase(ABC):
-    non1_storage_axis_names: Tuple[str, ...]
-    storage_base_indices: Tuple[ArithmeticExpressionT, ...]
-    non1_storage_shape: Tuple[ArithmeticExpressionT, ...]
-    non1_storage_axis_flags: Tuple[ArithmeticExpressionT, ...]
+    non1_storage_axis_names: tuple[str, ...]
+    storage_base_indices: tuple[ArithmeticExpression, ...]
+    non1_storage_shape: tuple[ArithmeticExpression, ...]
+    non1_storage_axis_flags: tuple[ArithmeticExpression, ...]
 
     @abstractmethod
     def is_access_descriptor_in_footprint(self, accdesc: AccessDescriptor) -> bool:
diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py
index 04c5ea385..3a755746e 100644
--- a/loopy/transform/batch.py
+++ b/loopy/transform/batch.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -83,7 +86,7 @@ def map_subscript(self, expr, expn_state):
         if not isinstance(idx, tuple):
             idx = (idx,)
 
-        return type(expr)(expr.aggregate, (self.batch_iname_expr,) + idx)
+        return type(expr)(expr.aggregate, (self.batch_iname_expr, *idx))
 
     def map_variable(self, expr, expn_state):
         if not self.needs_batch_subscript(expr.name):
@@ -98,7 +101,7 @@ def _add_unique_dim_name(name, dim_names):
 
     from pytools import UniqueNameGenerator
     ng = UniqueNameGenerator(set(dim_names))
-    return (ng(name),) + tuple(dim_names)
+    return (ng(name), *tuple(dim_names))
 
 
 @for_each_kernel
@@ -143,7 +146,7 @@ def to_batched(kernel, nbatches, batch_varying_args, batch_iname_prefix="ibatch"
         nbatches_expr = nbatches
 
     batch_domain = isl.BasicSet(batch_dom_str)
-    new_domains = [batch_domain] + kernel.domains
+    new_domains = [batch_domain, *kernel.domains]
 
     for arg in kernel.args:
         if arg.name in batch_varying_args:
@@ -152,7 +155,7 @@ def to_batched(kernel, nbatches, batch_varying_args, batch_iname_prefix="ibatch"
                         dim_tags="c")
             else:
                 arg = arg.copy(
-                        shape=(nbatches_expr,) + arg.shape,
+                        shape=(nbatches_expr, *arg.shape),
                         dim_tags=("c",) * (len(arg.shape) + 1),
                         dim_names=_add_unique_dim_name("ibatch", arg.dim_names))
 
@@ -168,7 +171,7 @@ def to_batched(kernel, nbatches, batch_varying_args, batch_iname_prefix="ibatch"
         for temp in kernel.temporary_variables.values():
             if temp_needs_batching_if_not_sequential(temp, batch_varying_args):
                 new_temps[temp.name] = temp.copy(
-                        shape=(nbatches_expr,) + temp.shape,
+                        shape=(nbatches_expr, *temp.shape),
                         dim_tags=("c",) * (len(temp.shape) + 1),
                         dim_names=_add_unique_dim_name("ibatch", temp.dim_names))
             else:
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index c8339f550..f113e453d 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012-2015 Andreas Kloeckner"
 
 __license__ = """
@@ -125,7 +128,7 @@ def map_array_access(self, index, expn_state):
         # Can't possibly be nested, but recurse anyway to
         # make sure substitution rules referenced below here
         # do not get thrown away.
-        self.rec(result, expn_state.copy(arg_context={}))
+        self.rec(result, expn_state.copy(arg_context=Map()))
 
         return result
 
diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py
index 0210eaee2..8669a4abb 100644
--- a/loopy/transform/callable.py
+++ b/loopy/transform/callable.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2018 Kaushik Kulkarni"
 
 __license__ = """
@@ -20,7 +23,8 @@
 THE SOFTWARE.
 """
 
-from collections.abc import Sequence
+
+from typing import TYPE_CHECKING
 
 from immutables import Map
 
@@ -49,6 +53,10 @@
 from loopy.translation_unit import FunctionIdT, TranslationUnit, for_each_kernel
 
 
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+
 __doc__ = """
 .. currentmodule:: loopy
 
@@ -314,7 +322,7 @@ def _inline_call_instruction(caller_knl, callee_knl, call_insn):
     parameters = call_insn.expression.parameters  # reads
 
     from loopy.kernel.function_interface import get_kw_pos_association
-    kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl)
+    _kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl)
 
     for i, par in enumerate(parameters):
         arg_map[pos_to_kw[i]] = par
diff --git a/loopy/transform/concatenate.py b/loopy/transform/concatenate.py
index cd095c462..5ecd234cd 100644
--- a/loopy/transform/concatenate.py
+++ b/loopy/transform/concatenate.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2022 Isuru Fernando"
 
 __license__ = """
@@ -25,7 +28,7 @@
 .. autofunction:: concatenate_arrays
 """
 
-from typing import List, Optional, Sequence
+from typing import Sequence
 
 import numpy as np
 
@@ -42,7 +45,7 @@
 def concatenate_arrays(
         kernel: LoopKernel,
         array_names: Sequence[str],
-        new_name: Optional[str] = None,
+        new_name: str | None = None,
         axis_nr: int = 0) -> LoopKernel:
     """Merges arrays (temporaries or arguments) into one array along the axis
     given by *axis_nr*.
@@ -125,7 +128,7 @@ def modify_array_access(expr):
         new_tvs[new_name] = new_ary
         return kernel.copy(temporary_variables=new_tvs)
     elif isinstance(new_ary, ArrayArg):
-        new_args: List[KernelArgument] = []
+        new_args: list[KernelArgument] = []
         inserted = False
         for arg in kernel.args:
             if arg.name in array_names:
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index c63604f8c..80a0c4a12 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -21,7 +24,7 @@
 """
 
 from dataclasses import dataclass, replace
-from typing import Dict, Optional, Tuple, cast
+from typing import TYPE_CHECKING, cast
 from warnings import warn
 
 import numpy as np
@@ -36,7 +39,10 @@
 from loopy.kernel.function_interface import CallableKernel, ScalarCallable
 from loopy.translation_unit import TranslationUnit, for_each_kernel
 from loopy.types import LoopyType
-from loopy.typing import ExpressionT
+
+
+if TYPE_CHECKING:
+    from loopy.typing import Expression
 
 
 # {{{ convenience: add_prefetch
@@ -124,7 +130,7 @@ def _process_footprint_subscripts(kernel, rule_name, sweep_inames,
 
                 kernel = _add_kernel_axis(kernel, axis_name, 0, arg.shape[axis_nr],
                         frozenset(sweep_inames) | fsub_dependencies)
-                sweep_inames = sweep_inames + [axis_name]
+                sweep_inames = [*sweep_inames, axis_name]
 
                 inames_to_be_removed.append(axis_name)
                 new_fsub.append(Variable(axis_name))
@@ -229,10 +235,10 @@ def add_prefetch_for_single_kernel(kernel, callables_table, var_name,
     from pymbolic import var
     uni_template = parsed_var_name
     if len(parameters) > 1:
-        uni_template = uni_template.index(
-                tuple(var(par_name) for par_name in parameters))
+        uni_template = uni_template[
+                tuple(var(par_name) for par_name in parameters)]
     elif len(parameters) == 1:
-        uni_template = uni_template.index(var(parameters[0]))
+        uni_template = uni_template[var(parameters[0])]
 
     # }}}
 
@@ -984,11 +990,11 @@ def add_padding_to_avoid_bank_conflicts(kernel, device):
 @dataclass(frozen=True)
 class _BaseStorageInfo:
     name: str
-    next_offset: ExpressionT
-    approx_nbytes: Optional[int] = None
+    next_offset: Expression
+    approx_nbytes: int | None = None
 
 
-def _sym_max(a: ExpressionT, b: ExpressionT) -> ExpressionT:
+def _sym_max(a: Expression, b: Expression) -> Expression:
     from numbers import Number
     if isinstance(a, Number) and isinstance(b, Number):
         return max(a, b)
@@ -999,9 +1005,9 @@ def _sym_max(a: ExpressionT, b: ExpressionT) -> ExpressionT:
 
 @for_each_kernel
 def allocate_temporaries_for_base_storage(kernel: LoopKernel,
-        only_address_space: Optional[int] = None,
+        only_address_space: int | None = None,
         aliased=True,
-        max_nbytes: Optional[int] = None,
+        max_nbytes: int | None = None,
         ) -> LoopKernel:
     from pytools import product
 
@@ -1010,8 +1016,8 @@ def allocate_temporaries_for_base_storage(kernel: LoopKernel,
 
     vng = kernel.get_var_name_generator()
 
-    name_aspace_dtype_to_bsi: Dict[
-            Tuple[str, AddressSpace, LoopyType], _BaseStorageInfo] = {}
+    name_aspace_dtype_to_bsi: dict[
+            tuple[str, AddressSpace, LoopyType], _BaseStorageInfo] = {}
 
     for tv in sorted(
             kernel.temporary_variables.values(),
@@ -1052,7 +1058,7 @@ def allocate_temporaries_for_base_storage(kernel: LoopKernel,
                 approx_array_nbytes = 0
 
             bs_key = (tv.base_storage,
-                      cast(AddressSpace, tv.address_space), tv.dtype)
+                      cast("AddressSpace", tv.address_space), tv.dtype)
             bsi = name_aspace_dtype_to_bsi.get(bs_key)
 
             if bsi is None or (
diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py
index 6c2688d90..c29a1895a 100644
--- a/loopy/transform/diff.py
+++ b/loopy/transform/diff.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
 
 __license__ = """
diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index 8e047c036..b16d837f6 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 795154099..c68c8be53 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -22,7 +25,7 @@
 
 
 from collections.abc import Collection, Iterable, Mapping, Sequence
-from typing import Any, FrozenSet, Optional
+from typing import TYPE_CHECKING, Any
 
 from typing_extensions import TypeAlias
 
@@ -33,8 +36,6 @@
 from loopy.diagnostic import LoopyError
 from loopy.kernel import LoopKernel
 from loopy.kernel.function_interface import CallableKernel
-from loopy.kernel.instruction import InstructionBase
-from loopy.match import ToStackMatchCovertible
 from loopy.symbolic import (
     RuleAwareIdentityMapper,
     RuleAwareSubstitutionMapper,
@@ -43,6 +44,11 @@
 from loopy.translation_unit import TranslationUnit, for_each_kernel
 
 
+if TYPE_CHECKING:
+    from loopy.kernel.instruction import InstructionBase
+    from loopy.match import ToStackMatchConvertible
+
+
 __doc__ = """
 .. currentmodule:: loopy
 
@@ -296,16 +302,16 @@ def _split_iname_backend(kernel, iname_to_split,
         new_prio = ()
         for prio_iname in prio:
             if prio_iname == iname_to_split:
-                new_prio = new_prio + (outer_iname, inner_iname)
+                new_prio = (*new_prio, outer_iname, inner_iname)
             else:
-                new_prio = new_prio + (prio_iname,)
+                new_prio = (*new_prio, prio_iname)
         new_priorities.append(new_prio)
 
     kernel = kernel.copy(
             domains=new_domains,
             iname_slab_increments=iname_slab_increments,
             instructions=new_insns,
-            applied_iname_rewrites=kernel.applied_iname_rewrites+(subst_map,),
+            applied_iname_rewrites=(*kernel.applied_iname_rewrites, subst_map),
             loop_priority=frozenset(new_priorities))
 
     rule_mapping_context = SubstitutionRuleMappingContext(
@@ -630,7 +636,7 @@ def subst_within_inames(fid):
             .copy(
                 instructions=new_insns,
                 domains=domch.get_domains_with(new_domain),
-                applied_iname_rewrites=kernel.applied_iname_rewrites + (subst_dict,)
+                applied_iname_rewrites=(*kernel.applied_iname_rewrites, subst_dict)
                 ))
 
     from loopy.match import parse_stack_match
@@ -1051,7 +1057,7 @@ def get_iname_duplication_options(kernel):
     if isinstance(kernel, TranslationUnit):
         if len([clbl for clbl in kernel.callables_table.values() if
                 isinstance(clbl, CallableKernel)]) == 1:
-            kernel = kernel[list(kernel.entrypoints)[0]]
+            kernel = kernel[next(iter(kernel.entrypoints))]
 
     assert isinstance(kernel, LoopKernel)
 
@@ -1096,7 +1102,7 @@ def has_schedulable_iname_nesting(kernel):
     if isinstance(kernel, TranslationUnit):
         if len([clbl for clbl in kernel.callables_table.values() if
                 isinstance(clbl, CallableKernel)]) == 1:
-            kernel = kernel[list(kernel.entrypoints)[0]]
+            kernel = kernel[next(iter(kernel.entrypoints))]
     return not bool(next(get_iname_duplication_options(kernel), False))
 
 # }}}
@@ -1398,7 +1404,7 @@ def parse_equation(eqn):
             rule_mapping_context.finish_kernel(
                 old_to_new.map_kernel(kernel))
             .copy(
-                applied_iname_rewrites=kernel.applied_iname_rewrites + (subst_dict,)
+                applied_iname_rewrites=(*kernel.applied_iname_rewrites, subst_dict)
                 ))
 
     # }}}
@@ -1744,7 +1750,7 @@ def add_inames_to_insn(kernel, inames, insn_match):
 # {{{ remove_inames_from_insn
 
 @for_each_kernel
-def remove_inames_from_insn(kernel: LoopKernel, inames: FrozenSet[str],
+def remove_inames_from_insn(kernel: LoopKernel, inames: frozenset[str],
         insn_match) -> LoopKernel:
     """
     :arg inames: a frozenset of inames that will be added to the
@@ -1832,7 +1838,7 @@ def remove_predicates_from_insn(kernel, predicates, insn_match):
 
 class _MapDomainMapper(RuleAwareIdentityMapper):
     def __init__(self, rule_mapping_context, new_inames, substitutions):
-        super(_MapDomainMapper, self).__init__(rule_mapping_context)
+        super().__init__(rule_mapping_context)
 
         self.old_inames = frozenset(substitutions)
         self.new_inames = new_inames
@@ -1852,7 +1858,7 @@ def map_reduction(self, expr, expn_state):
             if arg_ctx_overlap:
                 if arg_ctx_overlap == red_overlap:
                     # All variables are shadowed by context, that's OK.
-                    return super(_MapDomainMapper, self).map_reduction(
+                    return super().map_reduction(
                             expr, expn_state)
                 else:
                     raise LoopyError("Reduction '%s' has"
@@ -1871,14 +1877,14 @@ def map_reduction(self, expr, expn_state):
                         self.rec(expr.expr, expn_state),
                         expr.allow_simultaneous)
         else:
-            return super(_MapDomainMapper, self).map_reduction(expr, expn_state)
+            return super().map_reduction(expr, expn_state)
 
     def map_variable(self, expr, expn_state):
         if (expr.name in self.old_inames
                 and expr.name not in expn_state.arg_context):
             return self.substitutions[expr.name]
         else:
-            return super(_MapDomainMapper, self).map_variable(expr, expn_state)
+            return super().map_variable(expr, expn_state)
 
 # }}}
 
@@ -2082,7 +2088,7 @@ def map_domain(kernel, transform_map):
         substitutions[iname] = subst_from_map
         var_substitutions[var(iname)] = subst_from_map
 
-    applied_iname_rewrites = applied_iname_rewrites + (var_substitutions,)
+    applied_iname_rewrites = (*applied_iname_rewrites, var_substitutions)
     del var_substitutions
 
     # }}}
@@ -2375,8 +2381,8 @@ def rename_inames(
             old_inames: Collection[str],
             new_iname: str,
             existing_ok: bool = False,
-            within: ToStackMatchCovertible = None,
-            raise_on_domain_mismatch: Optional[bool] = None
+            within: ToStackMatchConvertible = None,
+            raise_on_domain_mismatch: bool | None = None
         ) -> LoopKernel:
     r"""
     :arg old_inames: A collection of inames that must be renamed to **new_iname**.
@@ -2519,9 +2525,9 @@ def rename_iname(
             old_iname: str,
             new_iname: str,
             existing_ok: bool = False,
-            within: ToStackMatchCovertible = None,
+            within: ToStackMatchConvertible = None,
             preserve_tags: bool = True,
-            raise_on_domain_mismatch: Optional[bool] = None
+            raise_on_domain_mismatch: bool | None = None
         ) -> LoopKernel:
     r"""
     Single iname version of :func:`loopy.rename_inames`.
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 494bbf0bc..ec876ea03 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -20,16 +23,19 @@
 THE SOFTWARE.
 """
 
-from typing import List, Mapping, Sequence, Tuple
+from typing import TYPE_CHECKING, Mapping, Sequence
 
 from loopy.diagnostic import LoopyError
 from loopy.kernel import LoopKernel
 from loopy.kernel.function_interface import CallableKernel, ScalarCallable
-from loopy.kernel.instruction import InstructionBase
 from loopy.symbolic import RuleAwareIdentityMapper
 from loopy.translation_unit import TranslationUnit, for_each_kernel
 
 
+if TYPE_CHECKING:
+    from loopy.kernel.instruction import InstructionBase
+
+
 # {{{ find_instructions
 
 def find_instructions_in_single_kernel(kernel, insn_match):
@@ -263,8 +269,8 @@ def replace_instruction_ids_in_insn(
         ) -> InstructionBase:
     changed = False
     new_depends_on = list(insn.depends_on)
-    extra_depends_on: List[str] = []
-    new_no_sync_with: List[Tuple[str, str]] = []
+    extra_depends_on: list[str] = []
+    new_no_sync_with: list[tuple[str, str]] = []
 
     if insn.id in replacements:
         assert isinstance(insn.id, str)
diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py
index 2a82952c2..9dc5f9a9b 100644
--- a/loopy/transform/pack_and_unpack_args.py
+++ b/loopy/transform/pack_and_unpack_args.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2018 Tianjiao Sun, Kaushik Kulkarni"
 
 __license__ = """
@@ -222,9 +225,9 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel,
                 new_indices = tuple(simplify_via_aff(i) for i in new_indices)
 
                 pack_lhs_assignee = pack_subst_mapper(
-                        var(pack_name).index(new_indices))
+                        var(pack_name)[new_indices])
                 unpack_rhs = unpack_subst_mapper(
-                        var(pack_name).index(new_indices))
+                        var(pack_name)[new_indices])
 
                 # }}}
 
@@ -266,13 +269,13 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel,
                         in_knl_callable.arg_id_to_descr[arg_id].shape):
                     iname_set = iname_set & make_slab(space, iname.name, 0,
                             axis_length)
-                new_domains = new_domains + [iname_set]
+                new_domains = [*new_domains, iname_set]
 
                 # }}}
 
                 new_id_to_parameters[arg_id] = SubArrayRef(
                         tuple(updated_swept_inames),
-                        (var(pack_name).index(tuple(updated_swept_inames))))
+                        (var(pack_name)[tuple(updated_swept_inames)]))
             else:
                 new_id_to_parameters[arg_id] = p
 
@@ -290,8 +293,8 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel,
                         new_ilp_inames),
                     expression=new_call_insn.expression.function(*new_params),
                     assignees=new_assignees)
-            old_insn_to_new_insns[insn.id] = (packing_insns + [new_call_insn] +
-                    unpacking_insns)
+            old_insn_to_new_insns[insn.id] = ([
+                *packing_insns, new_call_insn, *unpacking_insns])
 
     if old_insn_to_new_insns:
         new_instructions = []
diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py
index 76deccc44..9afc28f9f 100644
--- a/loopy/transform/padding.py
+++ b/loopy/transform/padding.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py
index fb1bc0b71..73732a16b 100644
--- a/loopy/transform/parameter.py
+++ b/loopy/transform/parameter.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index b0fbb5468..3988b1f5d 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -22,19 +25,17 @@
 
 
 from dataclasses import dataclass
-from typing import FrozenSet, List, Optional, Sequence, Type, Union, cast
+from typing import TYPE_CHECKING, Sequence, cast
 
 import numpy as np
 from immutables import Map
 
 import islpy as isl
-from pymbolic import ArithmeticExpressionT, var
+from pymbolic import ArithmeticExpression, var
 from pymbolic.mapper.substitutor import make_subst_func
 from pytools import memoize_on_first_arg
-from pytools.tag import Tag
 
 from loopy.diagnostic import LoopyError
-from loopy.kernel import LoopKernel
 from loopy.kernel.data import AddressSpace
 from loopy.kernel.function_interface import CallableKernel, ScalarCallable
 from loopy.kernel.instruction import InstructionBase, MultiAssignmentBase
@@ -42,7 +43,6 @@
     find_most_recent_global_barrier,
     kernel_has_global_barriers,
 )
-from loopy.match import ToStackMatchCovertible
 from loopy.symbolic import (
     CombineMapper,
     RuleAwareIdentityMapper,
@@ -60,7 +60,7 @@
 from loopy.translation_unit import CallablesTable, TranslationUnit
 from loopy.types import LoopyType, ToLoopyTypeConvertible, to_loopy_type
 from loopy.typing import (
-    ExpressionT,
+    Expression,
     auto,
     integer_expr_or_err,
     integer_or_err,
@@ -68,6 +68,13 @@
 )
 
 
+if TYPE_CHECKING:
+    from pytools.tag import Tag
+
+    from loopy.kernel import LoopKernel
+    from loopy.match import ToStackMatchConvertible
+
+
 # {{{ contains_subst_rule_invocation
 
 class FunctionNameCollector(CombineMapper):
@@ -101,9 +108,9 @@ def _get_called_names(insn):
     assert isinstance(insn, MultiAssignmentBase)
     from functools import reduce
 
-    from pymbolic.primitives import Expression
+    from pymbolic.primitives import ExpressionNode
     return ((_get_calls_in_expr(insn.expression)
-             if isinstance(insn.expression, Expression)
+             if isinstance(insn.expression, ExpressionNode)
              else frozenset())
             # indices of assignees might call the subst rules
             | reduce(frozenset.union,
@@ -113,7 +120,7 @@ def _get_called_names(insn):
             | reduce(frozenset.union,
                      (_get_calls_in_expr(pred)
                       for pred in insn.predicates
-                      if isinstance(pred, Expression)),
+                      if isinstance(pred, ExpressionNode)),
                      frozenset())
             )
 
@@ -133,14 +140,14 @@ def contains_a_subst_rule_invocation(kernel, insn):
 
 @dataclass(frozen=True)
 class RuleAccessDescriptor(AccessDescriptor):
-    args: Optional[Sequence[ArithmeticExpressionT]] = None
+    args: Sequence[ArithmeticExpression] | None = None
 
 
 def access_descriptor_id(args, expansion_stack):
     return (args, expansion_stack)
 
 
-def storage_axis_exprs(storage_axis_sources, args) -> Sequence[ExpressionT]:
+def storage_axis_exprs(storage_axis_sources, args) -> Sequence[Expression]:
     result = []
 
     for saxis_source in storage_axis_sources:
@@ -170,7 +177,7 @@ def __init__(self, rule_mapping_context, kernel, subst_name, subst_tag, within)
 
         self.access_descriptors: list[RuleAccessDescriptor] = []
 
-    def map_substitution(self, name, tag, arguments, expn_state):
+    def map_subst_rule(self, name, tag, arguments, expn_state):
         process_me = name == self.subst_name
 
         if self.subst_tag is not None and self.subst_tag != tag:
@@ -182,7 +189,7 @@ def map_substitution(self, name, tag, arguments, expn_state):
                 expn_state.stack)
 
         if not process_me:
-            return super().map_substitution(
+            return super().map_subst_rule(
                     name, tag, arguments, expn_state)
 
         rule = self.rule_mapping_context.old_subst_rules[name]
@@ -207,7 +214,7 @@ def map_substitution(self, name, tag, arguments, expn_state):
                         ", ".join(arg_deps - self.kernel.all_inames()),
                         ), stacklevel=1)
 
-            return super().map_substitution(
+            return super().map_subst_rule(
                     name, tag, arguments, expn_state)
 
         args = [arg_context[arg_name] for arg_name in rule.arguments]
@@ -252,7 +259,7 @@ def __init__(self, rule_mapping_context, subst_name, subst_tag, within,
         self.compute_read_variables = compute_read_variables
         self.compute_insn_depends_on = set()
 
-    def map_substitution(self, name, tag, arguments, expn_state):
+    def map_subst_rule(self, name, tag, arguments, expn_state):
         if not (
                 name == self.subst_name
                 and self.within(
@@ -260,7 +267,7 @@ def map_substitution(self, name, tag, arguments, expn_state):
                     expn_state.instruction,
                     expn_state.stack)
                 and (self.subst_tag is None or self.subst_tag == tag)):
-            return super().map_substitution(
+            return super().map_subst_rule(
                     name, tag, arguments, expn_state)
 
         # {{{ check if in footprint
@@ -275,7 +282,7 @@ def map_substitution(self, name, tag, arguments, expn_state):
                     self.storage_axis_sources, args))
 
         if not self.array_base_map.is_access_descriptor_in_footprint(accdesc):
-            return super().map_substitution(
+            return super().map_subst_rule(
                     name, tag, arguments, expn_state)
 
         # }}}
@@ -380,20 +387,20 @@ def precompute_for_single_kernel(
         callables_table: CallablesTable,
         subst_use,
         sweep_inames=None,
-        within: ToStackMatchCovertible = None,
+        within: ToStackMatchConvertible = None,
         *,
         storage_axes=None,
-        temporary_name: Optional[str] = None,
-        precompute_inames: Optional[Sequence[str]] = None,
-        precompute_outer_inames: Optional[FrozenSet[str]] = None,
+        temporary_name: str | None = None,
+        precompute_inames: Sequence[str] | None = None,
+        precompute_outer_inames: frozenset[str] | None = None,
         storage_axis_to_tag=None,
 
-        default_tag: Union[None, Tag, str] = None,
+        default_tag: Tag | str | None = None,
 
-        dtype: Optional[ToLoopyTypeConvertible] = None,
+        dtype: ToLoopyTypeConvertible | None = None,
         fetch_bounding_box: bool = False,
-        temporary_address_space: Union[AddressSpace, None, Type[auto]] = None,
-        compute_insn_id: Optional[str] = None,
+        temporary_address_space: AddressSpace | type[auto] | None = None,
+        compute_insn_id: str | None = None,
         _enable_mirgecom_workaround: bool = False,
         ) -> LoopKernel:
     """Precompute the expression described in the substitution rule determined by
@@ -514,7 +521,7 @@ def precompute_for_single_kernel(
 
     footprint_generators = None
 
-    subst_name: Optional[str] = None
+    subst_name: str | None = None
     subst_tag = None
 
     from pymbolic.primitives import Call, Variable
@@ -577,9 +584,9 @@ def precompute_for_single_kernel(
 
         for fpg in footprint_generators:
             if isinstance(fpg, Variable):
-                args: tuple[ArithmeticExpressionT, ...] = ()
+                args: tuple[ArithmeticExpression, ...] = ()
             elif isinstance(fpg, Call):
-                args = cast(tuple[ArithmeticExpressionT, ...], fpg.parameters)
+                args = cast("tuple[ArithmeticExpression, ...]", fpg.parameters)
             else:
                 raise ValueError("footprint generator must "
                         "be substitution rule invocation")
@@ -674,8 +681,8 @@ def precompute_for_single_kernel(
 
     prior_storage_axis_name_dict = {}
 
-    storage_axis_names: List[str] = []
-    storage_axis_sources: List[Union[str, int]] = []  # number for arg#, or iname
+    storage_axis_names: list[str] = []
+    storage_axis_sources: list[str | int] = []  # number for arg#, or iname
 
     # {{{ check for pre-existing precompute_inames
 
@@ -772,8 +779,7 @@ def precompute_for_single_kernel(
             if abm.non1_storage_axis_flags[i]:
                 non1_storage_axis_names.append(saxis)
             else:
-                if saxis in new_iname_to_tag:
-                    del new_iname_to_tag[saxis]
+                new_iname_to_tag.pop(saxis, None)
 
                 if saxis in preexisting_precompute_inames:
                     raise LoopyError("precompute axis %d (1-based) was "
@@ -922,8 +928,8 @@ def add_assumptions(d):
         # should.
 
         if _enable_mirgecom_workaround:
-            from pymbolic.primitives import Expression
-            if is_length_1 and not isinstance(base_index, Expression):
+            from pymbolic.primitives import ExpressionNode
+            if is_length_1 and not isinstance(base_index, ExpressionNode):
                 # I.e. base_index is an integer.
                 from pytools import is_single_valued
                 if is_single_valued(
@@ -963,7 +969,7 @@ def add_assumptions(d):
             # within_inames determined below
             )
     compute_dep_id = compute_insn_id
-    added_compute_insns: List[InstructionBase] = [compute_insn]
+    added_compute_insns: list[InstructionBase] = [compute_insn]
 
     if temporary_address_space == AddressSpace.GLOBAL:
         barrier_insn_id = kernel.make_unique_instruction_id(
@@ -1028,7 +1034,7 @@ def add_assumptions(d):
                 and insn.within_inames & prior_storage_axis_names):
             insn = (insn
                     .with_transformed_expressions(
-                        lambda expr: expr_subst_map(expr, kernel, insn))  # noqa: B023,E501
+                        lambda expr: expr_subst_map(expr, kernel, insn))  # noqa: B023
                     .copy(within_inames=frozenset(
                         new_iname
                         for iname in insn.within_inames
diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index ca31368d2..6e7eb1fb6 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
 
 __license__ = """
diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py
index 7d1f3c870..5f504e722 100644
--- a/loopy/transform/realize_reduction.py
+++ b/loopy/transform/realize_reduction.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = """
 Copyright (C) 2012 Andreas Kloeckner
 Copyright (C) 2022 University of Illinois Board of Trustees
@@ -26,7 +29,7 @@
 
 import logging
 from dataclasses import dataclass, replace
-from typing import Callable, Dict, FrozenSet, List, Optional, Sequence, Set, Tuple
+from typing import TYPE_CHECKING, Callable, Sequence
 
 
 logger = logging.getLogger(__name__)
@@ -34,12 +37,9 @@
 from immutables import Map
 
 import islpy as isl
-from pymbolic.primitives import Expression
 from pytools import memoize_on_first_arg
-from pytools.tag import Tag
 
 from loopy.diagnostic import LoopyError, ReductionIsNotTriangularError, warn_with_kernel
-from loopy.kernel import LoopKernel
 from loopy.kernel.data import AddressSpace, TemporaryVariable, make_assignment
 from loopy.kernel.function_interface import CallableKernel
 from loopy.kernel.instruction import Assignment, InstructionBase, MultiAssignmentBase
@@ -48,6 +48,13 @@
 from loopy.translation_unit import ConcreteCallablesTable, TranslationUnit
 
 
+if TYPE_CHECKING:
+    from pymbolic.primitives import ExpressionNode
+    from pytools.tag import Tag
+
+    from loopy.kernel import LoopKernel
+
+
 # {{{ reduction realization context
 
 @dataclass
@@ -59,14 +66,14 @@ class _ChangeFlag:
 class _ReductionRealizationContext:
     # {{{ read-only
 
-    mapper: "RealizeReductionCallbackMapper"
+    mapper: RealizeReductionCallbackMapper
 
     force_scan: bool
     automagic_scans_ok: bool
     unknown_types_ok: bool
 
     # FIXME: This feels like a broken-by-design concept.
-    force_outer_iname_for_scan: Optional[str]
+    force_outer_iname_for_scan: str | None
 
     # We use the original kernel for a number of lookups whose value
     # we do not change and which might be already cached on it.
@@ -82,17 +89,17 @@ class _ReductionRealizationContext:
     insn_id_gen: Callable[[str], str]
     var_name_gen: Callable[[str], str]
 
-    additional_temporary_variables: Dict[str, TemporaryVariable]
-    additional_insns: List[InstructionBase]
-    domains: List[isl.BasicSet]
-    additional_iname_tags: Dict[str, Sequence[Tag]]
+    additional_temporary_variables: dict[str, TemporaryVariable]
+    additional_insns: list[InstructionBase]
+    domains: list[isl.BasicSet]
+    additional_iname_tags: dict[str, Sequence[Tag]]
     # list only to facilitate mutation
-    boxed_callables_table: List[ConcreteCallablesTable]
+    boxed_callables_table: list[ConcreteCallablesTable]
 
     # FIXME: This is a broken-by-design concept. Local-parallel scans emit a
     # reduction internally. This serves to avoid force_scan acting on that
     # reduction.
-    inames_added_for_scan: Set[str]
+    inames_added_for_scan: set[str]
 
     # }}}
 
@@ -100,10 +107,10 @@ class _ReductionRealizationContext:
 
     # These are attributes from 'surrounding' instruction, for generated
     # instructions to potentially inherit.
-    surrounding_within_inames: FrozenSet[str]
-    surrounding_depends_on: FrozenSet[str]
-    surrounding_no_sync_with: FrozenSet[Tuple[str, str]]
-    surrounding_predicates: FrozenSet[Expression]
+    surrounding_within_inames: frozenset[str]
+    surrounding_depends_on: frozenset[str]
+    surrounding_no_sync_with: frozenset[tuple[str, str]]
+    surrounding_predicates: frozenset[ExpressionNode]
 
     # }}}
 
@@ -113,10 +120,10 @@ class _ReductionRealizationContext:
     # These are requested additions to attributes of the surrounding instruction.
 
     # FIXME add_within_inames seems broken by design.
-    surrounding_insn_add_within_inames: Set[str]
+    surrounding_insn_add_within_inames: set[str]
 
-    surrounding_insn_add_depends_on: Set[str]
-    surrounding_insn_add_no_sync_with: Set[Tuple[str, str]]
+    surrounding_insn_add_depends_on: set[str]
+    surrounding_insn_add_no_sync_with: set[tuple[str, str]]
 
     # }}}
 
@@ -171,9 +178,9 @@ def get_insn_kwargs(self):
 
 @dataclass(frozen=True)
 class _InameClassification:
-    sequential: Tuple[str, ...]
-    local_parallel: Tuple[str, ...]
-    nonlocal_parallel: Tuple[str, ...]
+    sequential: tuple[str, ...]
+    local_parallel: tuple[str, ...]
+    nonlocal_parallel: tuple[str, ...]
 
 
 def _classify_reduction_inames(red_realize_ctx, inames):
@@ -1124,7 +1131,7 @@ def map_reduction_local(red_realize_ctx, expr, nresults, arg_dtypes,
             red_realize_ctx=red_realize_ctx,
             name_based_on="acc_"+red_iname,
             nvars=nresults,
-            shape=outer_local_iname_sizes + (size,),
+            shape=(*outer_local_iname_sizes, size),
             dtypes=reduction_dtypes,
             address_space=AddressSpace.LOCAL)
 
@@ -1151,7 +1158,7 @@ def map_reduction_local(red_realize_ctx, expr, nresults, arg_dtypes,
     init_insn = make_assignment(
             id=init_id,
             assignees=tuple(
-                acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
+                acc_var[(*outer_local_iname_vars, var(base_exec_iname))]
                 for acc_var in acc_vars),
             expression=neutral,
             within_inames=(
@@ -1234,7 +1241,7 @@ def map_reduction_local(red_realize_ctx, expr, nresults, arg_dtypes,
     transfer_insn = make_assignment(
             id=transfer_id,
             assignees=tuple(
-                acc_var[outer_local_iname_vars + (var(red_iname),)]
+                acc_var[(*outer_local_iname_vars, var(red_iname))]
                 for acc_var in acc_vars),
             expression=expression,
             **transfer_red_realize_ctx.get_insn_kwargs())
@@ -1269,12 +1276,11 @@ def map_reduction_local(red_realize_ctx, expr, nresults, arg_dtypes,
                 arg_dtypes,
                 _strip_if_scalar(acc_vars, tuple(
                     acc_var[
-                        outer_local_iname_vars + (var(stage_exec_iname),)]
+                        (*outer_local_iname_vars, var(stage_exec_iname))]
                     for acc_var in acc_vars)),
                 _strip_if_scalar(acc_vars, tuple(
                     acc_var[
-                        outer_local_iname_vars + (
-                            var(stage_exec_iname) + new_size,)]
+                        (*outer_local_iname_vars, var(stage_exec_iname) + new_size)]
                     for acc_var in acc_vars)),
                 red_realize_ctx.boxed_callables_table[0],
                 orig_kernel.target)
@@ -1282,7 +1288,7 @@ def map_reduction_local(red_realize_ctx, expr, nresults, arg_dtypes,
         stage_insn = make_assignment(
                 id=stage_id,
                 assignees=tuple(
-                    acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
+                    acc_var[(*outer_local_iname_vars, var(stage_exec_iname))]
                     for acc_var in acc_vars),
                 expression=expression,
                 within_inames=(
@@ -1307,9 +1313,9 @@ def map_reduction_local(red_realize_ctx, expr, nresults, arg_dtypes,
 
     if nresults == 1:
         assert len(acc_vars) == 1
-        return acc_vars[0][outer_local_iname_vars + (0,)]
+        return acc_vars[0][(*outer_local_iname_vars, 0)]
     else:
-        return [acc_var[outer_local_iname_vars + (0,)] for acc_var in
+        return [acc_var[(*outer_local_iname_vars, 0)] for acc_var in
                 acc_vars]
 # }}}
 
@@ -1419,7 +1425,7 @@ def map_scan_seq(red_realize_ctx, expr, nresults, arg_dtypes,
             assignees=acc_vars,
             within_inames=(
                 red_realize_ctx.surrounding_within_inames
-                - frozenset((scan_param.sweep_iname,) + expr.inames)),
+                - frozenset((scan_param.sweep_iname, *expr.inames))),
             within_inames_is_final=True,
             depends_on=init_insn_depends_on,
             expression=expression,
@@ -1558,7 +1564,7 @@ def map_scan_local(red_realize_ctx, expr, nresults, arg_dtypes,
             red_realize_ctx=red_realize_ctx,
             name_based_on="acc_"+scan_param.scan_iname,
             nvars=nresults,
-            shape=outer_local_iname_sizes + (scan_size,),
+            shape=(*outer_local_iname_sizes, scan_size),
             dtypes=reduction_dtypes,
             address_space=AddressSpace.LOCAL)
 
@@ -1579,7 +1585,7 @@ def map_scan_local(red_realize_ctx, expr, nresults, arg_dtypes,
     init_insn = make_assignment(
             id=init_id,
             assignees=tuple(
-                acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
+                acc_var[(*outer_local_iname_vars, var(base_exec_iname))]
                 for acc_var in acc_vars),
             expression=neutral,
             within_inames=base_iname_deps | frozenset([base_exec_iname]),
@@ -1640,8 +1646,10 @@ def map_scan_local(red_realize_ctx, expr, nresults, arg_dtypes,
                 f"{red_realize_ctx.id_prefix}_{scan_param.scan_iname}_transfer")
         transfer_insn = make_assignment(
                 id=transfer_id,
-                assignees=(acc_var[outer_local_iname_vars
-                    + (var(scan_param.sweep_iname) - sweep_lower_bound_expr,)],),
+                assignees=(acc_var[(
+                    *outer_local_iname_vars,
+                    var(scan_param.sweep_iname) - sweep_lower_bound_expr)
+                ],),
                 expression=pre_scan_result_i,
                 within_inames=(
                     red_realize_ctx.surrounding_within_inames
@@ -1684,8 +1692,8 @@ def map_scan_local(red_realize_ctx, expr, nresults, arg_dtypes,
                     assignees=(read_var,),
                     expression=(
                             acc_var[
-                                outer_local_iname_vars
-                                + (var(stage_exec_iname) - cur_size,)]),
+                                (*outer_local_iname_vars,
+                                    var(stage_exec_iname) - cur_size)]),
                     within_inames=(
                         base_iname_deps | frozenset([stage_exec_iname])),
                     within_inames_is_final=True,
@@ -1713,7 +1721,7 @@ def map_scan_local(red_realize_ctx, expr, nresults, arg_dtypes,
             _strip_if_scalar(acc_vars, read_vars),
             _strip_if_scalar(acc_vars, tuple(
                 acc_var[
-                    outer_local_iname_vars + (var(stage_exec_iname),)]
+                    (*outer_local_iname_vars, var(stage_exec_iname))]
                 for acc_var in acc_vars)),
             red_realize_ctx.boxed_callables_table[0],
             orig_kernel.target)
@@ -1721,7 +1729,7 @@ def map_scan_local(red_realize_ctx, expr, nresults, arg_dtypes,
         write_stage_insn = make_assignment(
                 id=write_stage_id,
                 assignees=tuple(
-                    acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
+                    acc_var[(*outer_local_iname_vars, var(stage_exec_iname))]
                     for acc_var in acc_vars),
                 expression=expression,
                 within_inames=(
@@ -1744,9 +1752,9 @@ def map_scan_local(red_realize_ctx, expr, nresults, arg_dtypes,
 
     if nresults == 1:
         assert len(acc_vars) == 1
-        return acc_vars[0][outer_local_iname_vars + (output_idx,)]
+        return acc_vars[0][(*outer_local_iname_vars, output_idx)]
     else:
-        return [acc_var[outer_local_iname_vars + (output_idx,)]
+        return [acc_var[(*outer_local_iname_vars, output_idx)]
                 for acc_var in acc_vars]
 
 # }}}
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index bd25dec36..e1dbfd99d 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2016 Matt Wala"
 
 __license__ = """
@@ -77,7 +80,7 @@ def get_successor_relation(self):
 
         for idx, (item, next_item) in enumerate(zip(
                 reversed(self.schedule),
-                reversed(self.schedule + [None]))):
+                reversed([*self.schedule, None]))):
             sched_idx = len(self.schedule) - idx - 1
 
             # Look at next_item
@@ -760,7 +763,7 @@ def save_and_reload_temporaries(program, entrypoint=None):
     if entrypoint is None:
         if len(program.entrypoints) != 1:
             raise LoopyError("Missing argument 'entrypoint'.")
-        entrypoint = list(program.entrypoints)[0]
+        entrypoint = next(iter(program.entrypoints))
 
     knl = program[entrypoint]
 
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index 422d22568..3ca981aa0 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -36,7 +39,7 @@
 
 
 class ExprDescriptor(ImmutableRecord):
-    __slots__ = ["insn", "expr", "unif_var_dict"]
+    __slots__ = ["expr", "insn", "unif_var_dict"]
 
 
 # {{{ extract_subst
diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py
index ed68bb36e..6826876b0 100644
--- a/loopy/translation_unit.py
+++ b/loopy/translation_unit.py
@@ -31,9 +31,7 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    FrozenSet,
     Mapping,
-    Optional,
     TypeVar,
     Union,
 )
@@ -45,7 +43,6 @@
 from pymbolic.primitives import Call, Variable
 
 from loopy.diagnostic import DirectCallUncachedWarning, LoopyError
-from loopy.kernel import LoopKernel
 from loopy.kernel.function_interface import (
     CallableKernel,
     InKernelCallable,
@@ -57,10 +54,11 @@
     RuleAwareIdentityMapper,
     SubstitutionRuleMappingContext,
 )
-from loopy.target import TargetBase
 
 
 if TYPE_CHECKING:
+    from loopy.kernel import LoopKernel
+    from loopy.target import TargetBase
     from loopy.target.execution import ExecutorBase
 
 
@@ -237,7 +235,7 @@ class TranslationUnit:
 
     callables_table: ConcreteCallablesTable
     target: TargetBase
-    entrypoints: FrozenSet[str]
+    entrypoints: frozenset[str]
 
     def __post_init__(self):
 
@@ -336,6 +334,7 @@ def default_entrypoint(self) -> LoopKernel:
             ep_name, = self.entrypoints
             entrypoint = self[ep_name]
 
+            from loopy import LoopKernel
             if not isinstance(entrypoint, LoopKernel):
                 raise ValueError("default entrypoint is not a kernel")
 
@@ -346,7 +345,7 @@ def default_entrypoint(self) -> LoopKernel:
                              " determined.")
 
     def executor(self,
-                 *args, entrypoint: Optional[str] = None, **kwargs) -> ExecutorBase:
+                 *args, entrypoint: str | None = None, **kwargs) -> ExecutorBase:
         """Return an object that hosts caches of compiled code for execution (i.e.
         a subclass of :class:`ExecutorBase`, specific to an execution
         environment (e.g. an OpenCL context) and a given entrypoint.
@@ -584,9 +583,9 @@ class CallablesInferenceContext:
     """
     callables: Mapping[str, InKernelCallable]
     clbl_name_gen: Callable[[str], str]
-    renames: Mapping[str, FrozenSet[str]] = field(
+    renames: Mapping[str, frozenset[str]] = field(
             default_factory=lambda: collections.defaultdict(frozenset))
-    new_entrypoints: FrozenSet[str] = frozenset()
+    new_entrypoints: frozenset[str] = frozenset()
 
     def copy(self, **kwargs: Any) -> CallablesInferenceContext:
         return replace(self, **kwargs)
@@ -749,7 +748,7 @@ def __getitem__(self, name):
 # }}}
 
 
-TUnitOrKernelT = TypeVar("TUnitOrKernelT", LoopKernel, TranslationUnit)
+TUnitOrKernelT = TypeVar("TUnitOrKernelT", "LoopKernel", TranslationUnit)
 
 
 # {{{ helper functions
@@ -778,6 +777,7 @@ def _collective_check(
                 *args: P.args,
                 **kwargs: P.kwargs
             ) -> None:
+        from loopy import LoopKernel
         if isinstance(t_unit_or_kernel, TranslationUnit):
             for clbl in t_unit_or_kernel.callables_table.values():
                 if isinstance(clbl, CallableKernel):
@@ -807,6 +807,7 @@ def _collective_transform(
                 *args: P.args,
                 **kwargs: P.kwargs
             ) -> TUnitOrKernelT:
+        from loopy import LoopKernel
         if isinstance(t_unit_or_kernel, TranslationUnit):
             t_unit = t_unit_or_kernel
             new_callables = {}
@@ -886,7 +887,7 @@ def resolve_callables(t_unit: TranslationUnit) -> TranslationUnit:
     # get loopy specific callables
     known_callables.update(get_loopy_callables())
 
-    callables_table = {}
+    callables_table: dict[FunctionIdT, InKernelCallable] = {}
 
     # callables: name of the calls seen in the program
     callables = {name for name, clbl in t_unit.callables_table.items()
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 57548ab6f..b3c6ffddd 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner"
 
 __license__ = """
@@ -100,7 +103,7 @@ def map_call(self, expr, expn_state):
             else:
                 return super().map_call(expr, expn_state)
         else:
-            return self.map_substitution(name, tag, expr.parameters, expn_state)
+            return self.map_subst_rule(name, tag, expr.parameters, expn_state)
 
     def map_call_with_kwargs(self, expr):
         # See https://github.com/inducer/loopy/pull/323
@@ -1060,7 +1063,7 @@ def infer_unknown_types(
                 t_unit[e].args if arg.dtype not in (None, auto)}
         new_callable, clbl_inf_ctx = t_unit.callables_table[e].with_types(
                 arg_id_to_dtype, clbl_inf_ctx)
-        clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable,
+        clbl_inf_ctx, _new_name = clbl_inf_ctx.with_callable(e, new_callable,
                                                             is_entrypoint=True)
         if expect_completion:
             from loopy.types import LoopyType
diff --git a/loopy/types.py b/loopy/types.py
index 223b59cc5..fd7db51fa 100644
--- a/loopy/types.py
+++ b/loopy/types.py
@@ -203,13 +203,13 @@ def __eq__(self, other: object) -> bool:
 # }}}
 
 
-ToLoopyTypeConvertible: TypeAlias = Union[Type[auto], None, np.dtype, LoopyType]
+ToLoopyTypeConvertible: TypeAlias = Union[Type[auto], np.dtype, LoopyType, None]
 
 
 def to_loopy_type(dtype: ToLoopyTypeConvertible,
                   allow_auto: bool = False, allow_none: bool = False,
                   for_atomic: bool = False
-                  ) -> Union[Type[auto], None, LoopyType]:
+                  ) -> type[auto] | LoopyType | None:
     if dtype is None:
         if allow_none:
             return None
@@ -262,7 +262,7 @@ def to_loopy_type(dtype: ToLoopyTypeConvertible,
         }
 
 
-def to_unsigned_dtype(dtype: "np.dtype[Any]") -> "np.dtype[Any]":
+def to_unsigned_dtype(dtype: np.dtype[Any]) -> np.dtype[Any]:
     if dtype.kind == "u":
         return dtype
     if dtype.kind != "i":
diff --git a/loopy/typing.py b/loopy/typing.py
index 7cc7209b9..5316c356b 100644
--- a/loopy/typing.py
+++ b/loopy/typing.py
@@ -1,6 +1,11 @@
 """
-.. autoclass:: ExpressionT
+.. autoclass:: Expression
 .. autoclass:: ShapeType
+.. autodata:: InameStr
+.. autodata:: InameStrSet
+
+.. currentmodule:: loopy
+
 .. autoclass:: auto
 """
 
@@ -31,21 +36,22 @@
 """
 
 
-from typing import Optional, Tuple, TypeVar
+from typing import Tuple, TypeVar
 
 import numpy as np
 from typing_extensions import TypeAlias, TypeIs
 
-from pymbolic.primitives import Expression
-from pymbolic.typing import ArithmeticExpressionT, ExpressionT, IntegerT
+from pymbolic.primitives import ExpressionNode
+from pymbolic.typing import ArithmeticExpression, Expression, Integer
 
 
 # The Fortran parser may insert dimensions of 'None', but I'd like to phase
 # that out, so we're not encoding that in the type.
-ShapeType: TypeAlias = Tuple[ArithmeticExpressionT, ...]
+ShapeType: TypeAlias = Tuple[ArithmeticExpression, ...]
 StridesType: TypeAlias = ShapeType
 
 InameStr: TypeAlias = str
+InameStrSet: TypeAlias = frozenset[InameStr]
 
 
 class auto:  # noqa
@@ -58,7 +64,7 @@ class auto:  # noqa
 T = TypeVar("T")
 
 
-def not_none(obj: Optional[T]) -> T:
+def not_none(obj: T | None) -> T:
     assert obj is not None
     return obj
 
@@ -67,15 +73,15 @@ def is_integer(obj: object) -> TypeIs[int | np.integer]:
     return isinstance(obj, (int, np.integer))
 
 
-def integer_or_err(expr: ExpressionT) -> IntegerT:
+def integer_or_err(expr: Expression) -> Integer:
     if isinstance(expr, (int, np.integer)):
         return expr
     else:
         raise ValueError(f"expected integer, got {type(expr)}")
 
 
-def integer_expr_or_err(expr: ExpressionT) -> IntegerT | Expression:
-    if isinstance(expr, (int, np.integer, Expression)):
+def integer_expr_or_err(expr: Expression) -> Integer | ExpressionNode:
+    if isinstance(expr, (int, np.integer, ExpressionNode)):
         return expr
     else:
         raise ValueError(f"expected integer or expression, got {type(expr)}")
diff --git a/loopy/version.py b/loopy/version.py
index 609e6c179..8e350caf8 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
 __license__ = """
@@ -26,7 +29,7 @@
 
 
 VERSION_TEXT = metadata.version("loopy")
-_match = re.match("^([0-9.]+)([a-z0-9]*?)$", VERSION_TEXT)
+_match = re.match(r"^([0-9.]+)([a-z0-9]*?)$", VERSION_TEXT)
 assert _match is not None
 VERSION_STATUS = _match.group(2)
 VERSION = tuple(int(nr) for nr in _match.group(1).split("."))
diff --git a/proto-tests/test_fem_assembly.py b/proto-tests/test_fem_assembly.py
index 9103c42cc..0a28d5ccb 100644
--- a/proto-tests/test_fem_assembly.py
+++ b/proto-tests/test_fem_assembly.py
@@ -25,10 +25,10 @@ def test_laplacian_stiffness(ctx_factory):
     knl = lp.make_kernel(ctx.devices[0],
             "[Nc] -> {[K,i,j,q, dx_axis, ax_b]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d "  # noqa
             "and 0<= dx_axis, ax_b < %(dim)d}"
-            % dict(Nb=Nb, Nq=Nq, dim=dim),
+            % {"Nb": Nb, "Nq": Nq, "dim": dim},
             [
                 "dPsi(ij, dxi) := sum_float32(@ax_b,"
-                    "  jacInv[ax_b,dxi,K,q] * DPsi[ax_b,ij,q])",  # noqa
+                    "  jacInv[ax_b,dxi,K,q] * DPsi[ax_b,ij,q])",
                 "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
                     "sum_float32(dx_axis, dPsi$one(i,dx_axis)*dPsi$two(j,dx_axis))))"
                 ],
@@ -42,7 +42,7 @@ def test_laplacian_stiffness(ctx_factory):
             ],
             name="lapquad", assumptions="Nc>=1")
 
-    knl = lp.tag_inames(knl, dict(ax_b="unr"))
+    knl = lp.tag_inames(knl, {"ax_b": "unr"})
     seq_knl = knl
 
     def variant_fig31(knl):
@@ -77,7 +77,7 @@ def variant_fig33(knl):
         Ncloc = 16  # noqa
         knl = lp.split_iname(knl, "K", Ncloc,
                 outer_iname="Ko", inner_iname="Kloc")
-        knl = lp.precompute(knl, "dPsi$one", np.float32, ["dx_axis"], default_tag=None)  # noqa
+        knl = lp.precompute(knl, "dPsi$one", np.float32, ["dx_axis"], default_tag=None)
         knl = lp.tag_inames(knl, {"j": "ilp.seq"})
 
         return knl, ["Ko", "Kloc"]
@@ -123,7 +123,7 @@ def variant_simple_gpu_prefetch(knl):
         var_knl, loop_prio = variant(knl)
         kernel_gen = lp.generate_loop_schedules(var_knl,
                 loop_priority=loop_prio)
-        kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))
+        kernel_gen = lp.check_kernels(kernel_gen, {"Nc": Nc})
 
         # print lp.preprocess_kernel(var_knl)
 
diff --git a/proto-tests/test_sem.py b/proto-tests/test_sem.py
index d87126cfb..b1a2a9a22 100644
--- a/proto-tests/test_sem.py
+++ b/proto-tests/test_sem.py
@@ -53,7 +53,7 @@ def test_laplacian(ctx_factory):
             [
             lp.GlobalArg("u", dtype, shape=field_shape, order=order),
             lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
-            lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
+            lp.GlobalArg("G", dtype, shape=(6, *field_shape), order=order),
             lp.GlobalArg("D", dtype, shape=(n, n), order=order),
             lp.ValueArg("K", np.int32, approximately=1000),
             ],
@@ -96,11 +96,11 @@ def test_laplacian(ctx_factory):
     # print lp.preprocess_kernel(knl)
     # 1/0
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
 
     kernel_gen = lp.generate_loop_schedules(knl,
             loop_priority=["m_fetch_G", "i_fetch_u"])
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})
 
     K = 1000  # noqa
     lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
@@ -139,7 +139,7 @@ def test_laplacian_lmem(ctx_factory):
             [
             lp.GlobalArg("u", dtype, shape=field_shape, order=order),
             lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
-            lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
+            lp.GlobalArg("G", dtype, shape=(6, *field_shape), order=order),
             lp.GlobalArg("D", dtype, shape=(n, n), order=order),
             lp.ValueArg("K", np.int32, approximately=1000),
             ],
@@ -173,10 +173,10 @@ def test_laplacian_lmem(ctx_factory):
     # print lp.preprocess_kernel(knl)
     # 1/0
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})
 
     K = 1000  # noqa
     lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
@@ -216,7 +216,7 @@ def test_laplacian_lmem_ilp(ctx_factory):
             [
             lp.GlobalArg("u", dtype, shape=field_shape, order=order),
             lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
-            lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
+            lp.GlobalArg("G", dtype, shape=(6, *field_shape), order=order),
             lp.GlobalArg("D", dtype, shape=(n, n), order=order),
             lp.ValueArg("K", np.int32, approximately=1000),
             ],
@@ -246,10 +246,10 @@ def test_laplacian_lmem_ilp(ctx_factory):
     # print seq_knl
     # 1/0
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})
 
     for knl in kernel_gen:
         print(lp.generate_code(knl))
@@ -320,7 +320,7 @@ def test_advect(ctx_factory):
             lp.GlobalArg("Nu",  dtype, shape=field_shape, order=order),
             lp.GlobalArg("Nv",  dtype, shape=field_shape, order=order),
             lp.GlobalArg("Nw",  dtype, shape=field_shape, order=order),
-            lp.GlobalArg("G",   dtype, shape=(9,)+field_shape, order=order),
+            lp.GlobalArg("G",   dtype, shape=(9, *field_shape), order=order),
             lp.GlobalArg("D",   dtype, shape=(N, N),  order=order),
             lp.ValueArg("K",  np.int32, approximately=1000),
             ],
@@ -333,10 +333,10 @@ def test_advect(ctx_factory):
 
     knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000}, kill_level_min=5)
 
     K = 1000  # noqa
     lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
@@ -359,7 +359,6 @@ def test_advect_dealias(ctx_factory):
     K_sym = var("K")  # noqa
 
     field_shape = (N, N, N, K_sym)
-    interim_field_shape = (M, M, M, K_sym)  # noqa
 
     # 1. direction-by-direction similarity transform on u
     # 2. invert diagonal
@@ -447,13 +446,13 @@ def test_advect_dealias(ctx_factory):
 
     knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
 
     print(knl)
     # 1/0
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000}, kill_level_min=5)
 
     K = 1000  # noqa
     lp.auto_test_vs_ref(knl, ctx, kernel_gen,
@@ -508,13 +507,13 @@ def test_interp_diff(ctx_factory):
 
     knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
 
     print(knl)
     # 1/0
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000}, kill_level_min=5)
 
     K = 1000  # noqa
     lp.auto_test_vs_ref(knl, ctx, kernel_gen,
diff --git a/proto-tests/test_sem_tim.py b/proto-tests/test_sem_tim.py
index 2949b39d3..97dc03be9 100644
--- a/proto-tests/test_sem_tim.py
+++ b/proto-tests/test_sem_tim.py
@@ -53,7 +53,7 @@ def test_laplacian(ctx_factory):
             [
             lp.ArrayArg("u", dtype, shape=field_shape, order=order),
             lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
-            lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order),
+            lp.ArrayArg("G", dtype, shape=(6, *field_shape), order=order),
             lp.ArrayArg("D", dtype, shape=(n, n), order=order),
             lp.ValueArg("K", np.int32, approximately=1000),
             ],
@@ -98,11 +98,11 @@ def test_laplacian(ctx_factory):
     # print(lp.preprocess_kernel(knl))
     # 1/0
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
 
     kernel_gen = lp.generate_loop_schedules(knl,
             loop_priority=["m_fetch_G", "i_fetch_u"])
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})
 
     K = 1000  # noqa
     lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
@@ -140,7 +140,7 @@ def test_laplacian_lmem(ctx_factory):
             [
             lp.ArrayArg("u", dtype, shape=field_shape, order=order),
             lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
-            lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order),
+            lp.ArrayArg("G", dtype, shape=(6, *field_shape), order=order),
             lp.ArrayArg("D", dtype, shape=(n, n), order=order),
             lp.ValueArg("K", np.int32, approximately=1000),
             ],
@@ -188,7 +188,7 @@ def test_laplacian_lmem(ctx_factory):
 #    knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1"))
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})
 
     K = 1000  # noqa
     lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
@@ -230,7 +230,7 @@ def test_laplacian_lmem_ilp(ctx_factory):
             [
             lp.ArrayArg("u", dtype, shape=field_shape, order=order),
             lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
-            lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order),
+            lp.ArrayArg("G", dtype, shape=(6, *field_shape), order=order),
             lp.ArrayArg("D", dtype, shape=(n, n), order=order),
             lp.ValueArg("K", np.int32, approximately=1000),
             ],
@@ -254,10 +254,10 @@ def test_laplacian_lmem_ilp(ctx_factory):
     # print(seq_knl)
     # 1/0
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})
 
     for knl in kernel_gen:
         print(lp.generate_code(knl))
@@ -328,7 +328,7 @@ def test_advect(ctx_factory):
             lp.ArrayArg("Nu",  dtype, shape=field_shape, order=order),
             lp.ArrayArg("Nv",  dtype, shape=field_shape, order=order),
             lp.ArrayArg("Nw",  dtype, shape=field_shape, order=order),
-            lp.ArrayArg("G",   dtype, shape=(9,)+field_shape, order=order),
+            lp.ArrayArg("G",   dtype, shape=(9, *field_shape), order=order),
             lp.ArrayArg("D",   dtype, shape=(N, N),  order=order),
             lp.ValueArg("K",  np.int32, approximately=1000),
             ],
@@ -341,10 +341,10 @@ def test_advect(ctx_factory):
 
     knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000}, kill_level_min=5)
 
     K = 1000  # noqa
     lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
@@ -367,7 +367,6 @@ def test_advect_dealias(ctx_factory):
     K_sym = var("K")  # noqa
 
     field_shape = (N, N, N, K_sym)
-    interim_field_shape = (M, M, M, K_sym)  # noqa
 
     # 1. direction-by-direction similarity transform on u
     # 2. invert diagonal
@@ -455,13 +454,13 @@ def test_advect_dealias(ctx_factory):
 
     knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
 
     print(knl)
     # 1/0
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000}, kill_level_min=5)
 
     K = 1000  # noqa
     lp.auto_test_vs_ref(knl, ctx, kernel_gen,
@@ -516,13 +515,13 @@ def test_interp_diff(ctx_factory):
 
     knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
 
     print(knl)
     # 1/0
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000}, kill_level_min=5)
 
     K = 1000  # noqa
 
diff --git a/proto-tests/test_tim.py b/proto-tests/test_tim.py
index eb8125cdb..a5bea3ac4 100644
--- a/proto-tests/test_tim.py
+++ b/proto-tests/test_tim.py
@@ -38,7 +38,7 @@ def test_tim2d(ctx_factory):
             [
             lp.ArrayArg("u", dtype, shape=field_shape, order=order),
             lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
-            lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
+            lp.ArrayArg("G", dtype, shape=(3, *field_shape), order=order),
             # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
             lp.ArrayArg("D", dtype, shape=(n, n), order=order),
             # lp.ImageArg("D", dtype, shape=(n, n)),
@@ -46,8 +46,6 @@ def test_tim2d(ctx_factory):
             ],
             name="semlap2D", assumptions="K>=1")
 
-    unroll = 32  # noqa
-
     seq_knl = knl
     knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
     knl = lp.add_prefetch(knl, "u", ["i", "j",  "o"], default_tag="l.auto")
@@ -55,16 +53,16 @@ def test_tim2d(ctx_factory):
     knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
     knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1))
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
-    knl = lp.tag_inames(knl, dict(o="unr"))
-    knl = lp.tag_inames(knl, dict(m="unr"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})
+    knl = lp.tag_inames(knl, {"o": "unr"})
+    knl = lp.tag_inames(knl, {"m": "unr"})
 
 
 #    knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None) # axis/argument indices on G  # noqa
     knl = lp.add_prefetch(knl, "G", [2, 3], default_tag="l.auto")  # axis/argument indices on G  # noqa
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})
 
     K = 1000  # noqa
     lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
@@ -101,14 +99,12 @@ def test_red2d(ctx_factory):
             [
             lp.ArrayArg("u", dtype, shape=field_shape, order=order),
             lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
-            lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
+            lp.ArrayArg("G", dtype, shape=(3, *field_shape), order=order),
             lp.ArrayArg("D", dtype, shape=(n, n), order=order),
             lp.ValueArg("K", np.int32, approximately=1000),
             ],
             name="semlap2D", assumptions="K>=1")
 
-    unroll = 32  # noqa
-
     seq_knl = knl
     knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
     knl = lp.add_prefetch(knl, "u", ["i", "j",  "o"], default_tag="l.auto")
@@ -122,13 +118,13 @@ def test_red2d(ctx_factory):
     knl = lp.split_iname(knl, "j", n, inner_tag="l.0")  # , slabs=(0, 1))
     knl = lp.split_iname(knl, "i", n, inner_tag="l.1")  # , slabs=(0, 1))
 
-    knl = lp.tag_inames(knl, dict(o="unr"))
-    knl = lp.tag_inames(knl, dict(m="unr"))
+    knl = lp.tag_inames(knl, {"o": "unr"})
+    knl = lp.tag_inames(knl, {"m": "unr"})
 
     knl = lp.add_prefetch(knl, "G", [2, 3], default_tag="l.auto")  # axis/argument indices on G  # noqa
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})
 
     K = 1000  # noqa
     lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
@@ -168,7 +164,7 @@ def test_tim3d(ctx_factory):
             lp.ArrayArg("u", dtype, shape=field_shape, order=order),
             lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
 
-            lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order),
+            lp.ArrayArg("G", dtype, shape=(6, *field_shape), order=order),
             # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
             lp.ArrayArg("D", dtype, shape=(n, n), order=order),
             # lp.ImageArg("D", dtype, shape=(n, n)),
@@ -192,14 +188,14 @@ def test_tim3d(ctx_factory):
 
 #    knl = lp.tag_inames(knl, dict(k_inner="unr"))
 
-    knl = lp.tag_inames(knl, dict(o="unr"))
-    knl = lp.tag_inames(knl, dict(m="unr"))
+    knl = lp.tag_inames(knl, {"o": "unr"})
+    knl = lp.tag_inames(knl, {"m": "unr"})
 #    knl = lp.tag_inames(knl, dict(i="unr"))
 
     knl = lp.add_prefetch(knl, "G", [2, 3, 4], default_tag="l.auto")  # axis/argument indices on G  # noqa
 
     kernel_gen = lp.generate_loop_schedules(knl)
-    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))
+    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})
 
     K = 4000  # noqa
     lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
diff --git a/pyproject.toml b/pyproject.toml
index 3204163f0..d61564a70 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,8 +1,6 @@
 [build-system]
-build-backend = "setuptools.build_meta"
-requires = [
-    "setuptools>=63",
-]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
 
 [project]
 name = "loopy"
@@ -32,7 +30,7 @@ classifiers = [
 ]
 dependencies = [
     "pytools>=2024.1.5",
-    "pymbolic>=2024.1",
+    "pymbolic>=2024.2.2",
     "genpy>=2016.1.2",
 
     # https://github.com/inducer/loopy/pull/419
@@ -62,22 +60,22 @@ fortran = [
     "ply>=3.6",
 ]
 
+[tool.hatch.metadata]
+allow-direct-references = true
+
+[tool.hatch.build.targets.sdist]
+exclude = [
+  "/.git*",
+  "/doc/_build",
+  "/run-*.sh",
+]
+
 [project.scripts]
 
 [project.urls]
 Documentation = "https://documen.tician.de/loopy"
 Homepage = "https://github.com/inducer/loopy"
 
-[tool.setuptools.packages.find]
-include = [
-    "loopy*",
-]
-
-[tool.setuptools.package-data]
-loopy = [
-    "py.typed",
-]
-
 [tool.setuptools.package-dir]
 # https://github.com/Infleqtion/client-superstaq/pull/715
 "" = "."
@@ -99,11 +97,10 @@ extend-select = [
     "Q",   # flake8-quotes
     "W",   # pycodestyle
 
-    # TODO
-    # "UP",  # pyupgrade
-    # "RUF",  # ruff
-
-    "RUF022", # __all__ isn't sorted
+    "UP",  # pyupgrade
+    "RUF",  # ruff
+    "FA",  # flake8-future-annotations
+    "TC",  # flake8-type-checking
 ]
 extend-ignore = [
     "C90",  # McCabe complexity
@@ -115,8 +112,8 @@ extend-ignore = [
 
     # FIXME
     "NPY002", # numpy rng
-    "C408", # unnecssary dict() -> literal
-    "F841", # local variable unused
+    "UP031", # .format instead of %s
+    "UP032", # f-string instead of .format
 ]
 
 [tool.ruff.lint.per-file-ignores]
@@ -124,6 +121,12 @@ extend-ignore = [
 "loopy/target/c/compyte/ndarray/**/*.py" = ["Q", "B", "E", "F", "N", "C4"]
 "loopy/frontend/fortran/translator.py" = ["N802", "B028"]
 "proto-tests/*.py" = ["B"]
+"contrib/**/*.py" = ["I002"]
+"doc/conf.py" = ["I002"]
+"*.ipynb" = ["I002"]
+"examples/**/*.py" = ["I002"]
+"proto-tests/**/*.py" = ["I002"]
+"test/**/*.py" = ["I002"]
 
 [tool.ruff.lint.flake8-quotes]
 docstring-quotes = "double"
@@ -144,6 +147,7 @@ known-local-folder = [
     "loopy",
 ]
 lines-after-imports = 2
+required-imports = ["from __future__ import annotations"]
 
 [tool.mypy]
 python_version = "3.10"
@@ -153,24 +157,23 @@ warn_unused_ignores = true
 # check_untyped_defs = true
 
 exclude = [
-  "loopy/target/c/compyte/ndarray/.*",
   "loopy/target/c/compyte/array.py",
 ]
 
 [[tool.mypy.overrides]]
 module = [
+    "loopy.symbolic",
+]
+# check_untyped_defs = true
+
+[[tool.mypy.overrides]]
+module = [
+    "IPython.*",
+    "fparser.*",
     "islpy.*",
-    "pymbolic.*",
-    "cgen.*",
-    "genpy.*",
-    "pyopencl.*",
-    "colorama.*",
-    "codepy.*",
     "mako.*",
-    "fparser.*",
     "ply.*",
-    "pygments.*",
-    "IPython.*",
+    "pyopencl.*",
 ]
 ignore_missing_imports = true
 
@@ -190,6 +193,8 @@ dout = "dout"
 ue = "ue"
 # used in an ordering context, "ab" / "ba"
 ba = "ba"
+# Fortran Loopy
+floopy = "floopy"
 
 "dependees" = "dependees"
 
diff --git a/test/gnuma_loopy_transforms.py b/test/gnuma_loopy_transforms.py
index 9c4400406..1b8842a66 100644
--- a/test/gnuma_loopy_transforms.py
+++ b/test/gnuma_loopy_transforms.py
@@ -39,6 +39,6 @@ def set_D_storage_format(kernel):  # noqa: N802
 def set_up_volume_loop(kernel, Nq):  # noqa
     kernel = lp.fix_parameters(kernel, Nq=Nq)
     kernel = lp.prioritize_loops(kernel, "e,k,j,i")
-    kernel = lp.tag_inames(kernel, dict(e="g.0", j="l.1", i="l.0"))
+    kernel = lp.tag_inames(kernel, {"e": "g.0", "j": "l.1", "i": "l.0"})
     kernel = lp.assume(kernel, "elements >= 1")
     return kernel
diff --git a/test/test_apps.py b/test/test_apps.py
index c4cffaee1..8c32aa9ac 100644
--- a/test/test_apps.py
+++ b/test/test_apps.py
@@ -27,7 +27,7 @@
 import pytest
 
 import pyopencl as cl
-import pyopencl.clmath  # noqa
+import pyopencl.clmath
 import pyopencl.clrandom  # noqa
 
 import loopy as lp
@@ -102,7 +102,7 @@ def variant_1(knl):
     def variant_2(knl):
         knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
         knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
-        knl = lp.tag_inames(knl, dict(ifeat="g.2"))
+        knl = lp.tag_inames(knl, {"ifeat": "g.2"})
         knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]",
                 fetch_outer_inames="im_x_outer, im_y_outer, ifeat",
                 default_tag="l.auto")
@@ -117,10 +117,10 @@ def variant_2(knl):
             variant_2
             ]:
         lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
-                parameters=dict(
-                    im_w=128, im_h=128, f_w=f_w,
-                    nfeats=3, nimgs=3
-                    ))
+                parameters={
+                    "im_w": 128, "im_h": 128, "f_w": f_w,
+                    "nfeats": 3, "nimgs": 3
+                    })
 
 
 def test_convolution_with_nonzero_base(ctx_factory):
@@ -175,10 +175,10 @@ def variant_1(knl):
             variant_1,
             ]:
         lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
-                parameters=dict(
-                    im_w=128, im_h=128, f_w=f_w,
-                    nfeats=12, nimgs=17
-                    ))
+                parameters={
+                    "im_w": 128, "im_h": 128, "f_w": f_w,
+                    "nfeats": 12, "nimgs": 17
+                    })
 
 # }}}
 
@@ -227,12 +227,12 @@ def test_rob_stroud_bernstein():
     knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
     knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
             slabs=(0, 1))
-    knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))
-    knl = lp.add_dtypes(knl, dict(
-                qpts=np.float32,
-                coeffs=np.float32,
-                tmp=np.float32,
-                ))
+    knl = lp.tag_inames(knl, {"i2": "l.1", "alpha1": "unr", "alpha2": "unr"})
+    knl = lp.add_dtypes(knl, {
+                "qpts": np.float32,
+                "coeffs": np.float32,
+                "tmp": np.float32,
+                })
     print(lp.generate_code_v2(knl))
 
 
@@ -306,18 +306,18 @@ def test_rob_stroud_bernstein_full():
         knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
         knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
                 slabs=(0, 1))
-        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))
+        knl = lp.tag_inames(knl, {"i2": "l.1", "alpha1": "unr", "alpha2": "unr"})
 
     from pickle import dumps, loads
     knl = loads(dumps(knl))
 
     knl = lp.add_dtypes(knl,
-            dict(
-                qpts=np.float32,
-                tmp=np.float32,
-                coeffs=np.float32,
-                result=np.float32,
-                ))
+            {
+                "qpts": np.float32,
+                "tmp": np.float32,
+                "coeffs": np.float32,
+                "result": np.float32,
+                })
     print(lp.generate_code_v2(knl))
 
 
@@ -393,7 +393,7 @@ def test_stencil_with_overfetch(ctx_factory):
         # https://github.com/pocl/pocl/issues/205
         pytest.skip("takes very long to compile on pocl")
 
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
 
     ref_knl = knl
 
@@ -411,7 +411,7 @@ def variant_overfetch(knl):
         n = 200
         lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
                 print_ref_code=False,
-                op_count=[n*n], parameters=dict(n=n), op_label=["cells"])
+                op_count=[n*n], parameters={"n": n}, op_label=["cells"])
 
 
 def test_sum_factorization():
@@ -531,8 +531,8 @@ def test_fd_demo():
     # u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)
 
     knl = lp.set_options(knl, write_code=True)
-    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
-    code, inf = lp.generate_code(knl)
+    knl = lp.add_and_infer_dtypes(knl, {"u": np.float32})
+    code, _inf = lp.generate_code(knl)
     print(code)
 
     assert "double" not in code
@@ -555,7 +555,7 @@ def test_fd_1d(ctx_factory):
 
     lp.auto_test_vs_ref(
             ref_knl, ctx, knl,
-            parameters=dict(n=2048))
+            parameters={"n": 2048})
 
 
 def test_poisson_fem(ctx_factory):
@@ -600,12 +600,12 @@ def variant_2(knl):
         return knl
 
     def add_types(knl):
-        return lp.add_and_infer_dtypes(knl, dict(
-            w=np.float32,
-            J=np.float32,
-            DPsi=np.float32,
-            DFinv=np.float32,
-            ))
+        return lp.add_and_infer_dtypes(knl, {
+            "w": np.float32,
+            "J": np.float32,
+            "DPsi": np.float32,
+            "DFinv": np.float32,
+            })
 
     for variant in [
             # variant_1,
@@ -615,7 +615,7 @@ def add_types(knl):
 
         lp.auto_test_vs_ref(
                 add_types(ref_knl), ctx, add_types(knl),
-                parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
+                parameters={"n": 5, "nels": 15, "nbf": 5, "sdim": 2, "nqp": 7})
 
 
 def test_domain_tree_nesting():
diff --git a/test/test_c_execution.py b/test/test_c_execution.py
index 6208b9aed..9943d41df 100644
--- a/test/test_c_execution.py
+++ b/test/test_c_execution.py
@@ -365,7 +365,7 @@ def test_one_length_loop():
 
 def test_scalar_global_args():
     n = np.random.default_rng().integers(30, 100)
-    evt, (out,) = lp.make_kernel(
+    _evt, (out,) = lp.make_kernel(
             "{[i]: 0<=i<n}",
             "res  = sum(i, i)",
             target=lp.ExecutableCTarget())(n=n)
diff --git a/test/test_callables.py b/test/test_callables.py
index 44a94e43a..060147859 100644
--- a/test/test_callables.py
+++ b/test/test_callables.py
@@ -26,7 +26,7 @@
 import pytest
 
 import pyopencl as cl
-import pyopencl.clrandom  # noqa: F401
+import pyopencl.clrandom
 from pyopencl.tools import (  # noqa: F401
     pytest_generate_tests_for_pyopencl as pytest_generate_tests,
 )
@@ -52,7 +52,7 @@ def test_register_function_lookup(ctx_factory):
             """)
     prog = lp.register_callable(prog, "log2", Log2Callable("log2"))
 
-    evt, (out, ) = prog(queue, x=x)
+    _evt, (out, ) = prog(queue, x=x)
 
     assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15
 
@@ -98,7 +98,7 @@ def test_register_knl(ctx_factory, inline):
         knl = lp.inline_callable_kernel(knl, "linear_combo2")
         knl = lp.inline_callable_kernel(knl, "linear_combo1")
 
-    evt, (out, ) = knl(queue, x=x, y=y)
+    _evt, (out, ) = knl(queue, x=x, y=y)
 
     assert (np.linalg.norm(2*x+3*y-out)/(
         np.linalg.norm(2*x+3*y))) < 1e-15
@@ -137,7 +137,7 @@ def test_slices_with_negative_step(ctx_factory, inline):
     if inline:
         knl = lp.inline_callable_kernel(knl, "linear_combo")
 
-    evt, (out, ) = knl(queue, x=x, y=y)
+    _evt, (out, ) = knl(queue, x=x, y=y)
 
     assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/(
         np.linalg.norm(2*x+3*y))) < 1e-15
@@ -176,7 +176,7 @@ def test_register_knl_with_hw_axes(ctx_factory, inline):
     if inline:
         knl = lp.inline_callable_kernel(knl, "linear_combo")
 
-    evt, out = knl(queue, x=x_dev, y=y_dev)
+    _evt, out = knl(queue, x=x_dev, y=y_dev)
 
     x_host = x_dev.get()
     y_host = y_dev.get()
@@ -231,7 +231,7 @@ def test_shape_translation_through_sub_array_ref(ctx_factory, inline):
 
     knl = lp.set_options(knl, write_code=True)
     knl = lp.set_options(knl, return_dict=True)
-    evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3)
+    _evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3)
 
     y1 = out_dict["y1"].get()
     y2 = out_dict["y2"].get()
@@ -284,7 +284,7 @@ def test_multi_arg_array_call(ctx_factory):
 
     knl = lp.merge([knl, argmin_kernel])
     b = np.random.randn(n)
-    evt, out_dict = knl(queue, b=b)
+    _evt, out_dict = knl(queue, b=b)
     tol = 1e-15
     from numpy.linalg import norm
     assert norm(out_dict["min_val"] - np.min(b)) < tol
@@ -330,7 +330,7 @@ def test_packing_unpacking(ctx_factory, inline):
 
     knl = lp.set_options(knl, write_code=True)
     knl = lp.set_options(knl, return_dict=True)
-    evt, out_dict = knl(queue, x1=x1, x2=x2)
+    _evt, out_dict = knl(queue, x1=x1, x2=x2)
 
     y1 = out_dict["y1"].get()
     y2 = out_dict["y2"].get()
@@ -367,7 +367,7 @@ def test_empty_sub_array_refs(ctx_factory, inline):
     if inline:
         caller = lp.inline_callable_kernel(caller, "wence_function")
 
-    evt, (out, ) = caller(queue, x=x, y=y)
+    _evt, (out, ) = caller(queue, x=x, y=y)
     assert np.allclose(out, x-y)
 
 
@@ -403,7 +403,7 @@ def test_array_inputs_to_callee_kernels(ctx_factory, inline):
     if inline:
         knl = lp.inline_callable_kernel(knl, "linear_combo")
 
-    evt, (out, ) = knl(queue, x=x, y=y)
+    _evt, (out, ) = knl(queue, x=x, y=y)
 
     assert (np.linalg.norm(2*x+3*y-out)/(
         np.linalg.norm(2*x+3*y))) < 1e-15
@@ -484,7 +484,7 @@ def test_argument_matching_for_inplace_update(ctx_factory):
     knl = lp.merge([knl, twice])
 
     x = np.random.randn(10)
-    evt, (out, ) = knl(queue, x=np.copy(x))
+    _evt, (out, ) = knl(queue, x=np.copy(x))
 
     assert np.allclose(2*x, out)
 
@@ -507,7 +507,7 @@ def test_non_zero_start_in_subarray_ref(ctx_factory):
     knl = lp.merge([knl, twice])
 
     x = np.random.randn(10)
-    evt, (out, ) = knl(queue, x=np.copy(x))
+    _evt, (out, ) = knl(queue, x=np.copy(x))
 
     assert np.allclose(2*x, out)
 
@@ -558,7 +558,7 @@ def test_callees_with_gbarriers_are_inlined(ctx_factory):
             """, [lp.GlobalArg("y", shape=6, dtype=None)])
 
     t_unit = lp.merge([t_unit, ones_and_zeros])
-    evt, (out,) = t_unit(queue)
+    _evt, (out,) = t_unit(queue)
 
     expected_out = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32)
 
@@ -593,7 +593,7 @@ def test_callees_with_gbarriers_are_inlined_with_nested_calls(ctx_factory):
             """, [lp.GlobalArg("y", shape=6, dtype=None)])
 
     t_unit = lp.merge([t_unit, dummy_ones_and_zeros, ones_and_zeros])
-    evt, (out,) = t_unit(queue)
+    _evt, (out,) = t_unit(queue)
 
     expected_out = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32)
 
@@ -626,7 +626,7 @@ def test_inlining_with_indirections(ctx_factory):
 
     map_in = np.arange(3).astype(np.int32)
 
-    evt, (out, ) = t_unit(queue, mymap=map_in)
+    _evt, (out, ) = t_unit(queue, mymap=map_in)
 
     expected_out = np.array([1, 1, 1, 0, 0, 0]).astype(np.float32)
     assert (expected_out == out).all()
@@ -651,7 +651,7 @@ def test_inlining_with_callee_domain_param(ctx_factory):
 
     caller = lp.merge([caller, fill2])
     caller = lp.inline_callable_kernel(caller, "fill2")
-    evt, (out, ) = caller(queue)
+    _evt, (out, ) = caller(queue)
 
     assert (out == 2).all()
 
@@ -703,7 +703,7 @@ def test_passing_and_getting_scalar_in_clbl_knl(ctx_factory, inline):
     if inline:
         knl = lp.inline_callable_kernel(knl, "call_sin")
 
-    evt, (out,) = knl(cq, real_x=np.asarray(3.0, dtype=float))
+    _evt, (_out,) = knl(cq, real_x=np.asarray(3.0, dtype=float))
 
 
 @pytest.mark.parametrize("inline", [False, True])
@@ -731,7 +731,7 @@ def test_passing_scalar_as_indexed_subcript_in_clbl_knl(ctx_factory, inline):
     if inline:
         knl = lp.inline_callable_kernel(knl, "twice")
 
-    evt, (out,) = knl(cq, X=x_in)
+    _evt, (out,) = knl(cq, X=x_in)
 
     np.testing.assert_allclose(out.get(), 2*x_in)
 
@@ -752,7 +752,7 @@ def test_symbol_mangler_in_call(ctx_factory):
 
     knl = lp.register_preamble_generators(knl, [preamble_for_x])
 
-    evt, (out,) = knl(cq)
+    _evt, (out,) = knl(cq)
     np.testing.assert_allclose(out.get(), np.sin(10))
 
 
@@ -927,7 +927,7 @@ def test_non1_step_slices(ctx_factory, start, inline):
     if inline:
         t_unit = lp.inline_callable_kernel(t_unit, "squared_arange")
 
-    evt, out_dict = t_unit(cq)
+    _evt, out_dict = t_unit(cq)
 
     np.testing.assert_allclose(out_dict["X"].get(), expected_out1)
     np.testing.assert_allclose(out_dict["Y"].get(), expected_out2)
@@ -1012,7 +1012,7 @@ def test_callee_with_parameter_and_grid(ctx_factory):
     knl = lp.split_iname(knl, "i", 2,
                          outer_tag="g.0", within="in_kernel:arange")
 
-    evt, (out,) = knl(cq)
+    _evt, (out,) = knl(cq)
     np.testing.assert_allclose(out.get(), np.arange(10))
 
 
@@ -1255,7 +1255,7 @@ def test_call_kernel_w_preds(ctx_factory, inline):
     if inline:
         knl = lp.inline_callable_kernel(knl, "twice")
 
-    evt, (out,) = knl(cq, x=np.ones((10, 10)))
+    _evt, (out,) = knl(cq, x=np.ones((10, 10)))
 
     np.testing.assert_allclose(out[:5], 1)
     np.testing.assert_allclose(out[5:], 2)
@@ -1486,7 +1486,7 @@ def test_subarray_ref_with_repeated_indices(ctx_factory):
             )
     knl = lp.merge([parent_knl, child_knl])
     knl = lp.inline_callable_kernel(knl, "ones")
-    evt, (z_dev,) = knl(cq)
+    _evt, (z_dev,) = knl(cq)
     assert np.allclose(z_dev.get(), np.eye(10))
 
 
diff --git a/test/test_dg.py b/test/test_dg.py
index bc134d9cb..e19507260 100644
--- a/test/test_dg.py
+++ b/test/test_dg.py
@@ -25,7 +25,7 @@
 
 import numpy as np
 import pyopencl as cl
-import pyopencl.array  # noqa
+import pyopencl.array
 from pyopencl.tools import (  # noqa
     pytest_generate_tests_for_pyopencl as pytest_generate_tests,
 )
@@ -79,22 +79,22 @@ def test_dg_volume(ctx_factory):
     seq_knl = knl
 
     def variant_basic(knl):
-        knl = lp.tag_inames(knl, dict(k="g.0", n="l.0"))
+        knl = lp.tag_inames(knl, {"k": "g.0", "n": "l.0"})
         return knl
 
     def variant_more_per_work_group(knl):
-        knl = lp.tag_inames(knl, dict(n="l.0"))
+        knl = lp.tag_inames(knl, {"n": "l.0"})
         knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
         return knl
 
     def variant_image_d(knl):
-        knl = lp.tag_inames(knl, dict(n="l.0"))
+        knl = lp.tag_inames(knl, {"n": "l.0"})
         knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
         knl = lp.change_arg_to_image(knl, "DrDsDt")
         return knl
 
     def variant_prefetch_d(knl):
-        knl = lp.tag_inames(knl, dict(n="l.0"))
+        knl = lp.tag_inames(knl, {"n": "l.0"})
         knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
         knl = lp.add_prefetch(knl, "DrDsDt[:,:]",
                 fetch_outer_inames="k_outer",
@@ -102,7 +102,7 @@ def variant_prefetch_d(knl):
         return knl
 
     def variant_prefetch_fields(knl):
-        knl = lp.tag_inames(knl, dict(n="l.0"))
+        knl = lp.tag_inames(knl, {"n": "l.0"})
         knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
         for name in ["u", "v", "w", "p"]:
             knl = lp.add_prefetch(knl, "%s[k,:]" % name, ["k_inner"],
@@ -111,14 +111,14 @@ def variant_prefetch_fields(knl):
         return knl
 
     def variant_k_ilp(knl):
-        knl = lp.tag_inames(knl, dict(n="l.0"))
+        knl = lp.tag_inames(knl, {"n": "l.0"})
 
         knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="ilp")
-        knl = lp.tag_inames(knl, dict(m="unr"))
+        knl = lp.tag_inames(knl, {"m": "unr"})
         return knl
 
     def variant_simple_padding(knl):
-        knl = lp.tag_inames(knl, dict(n="l.0"))
+        knl = lp.tag_inames(knl, {"n": "l.0"})
 
         knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
 
@@ -130,12 +130,12 @@ def variant_simple_padding(knl):
         for name in arg_names:
             knl = lp.add_padding(knl, name, axis=0, align_bytes=32)
 
-        knl = lp.tag_inames(knl, dict(m="unr"))
+        knl = lp.tag_inames(knl, {"m": "unr"})
 
         return knl
 
     def variant_fancy_padding(knl):
-        knl = lp.tag_inames(knl, dict(n="l.0"))
+        knl = lp.tag_inames(knl, {"n": "l.0"})
 
         pad_mult = lp.find_padding_multiple(knl, "u", 1, 32)
 
@@ -148,7 +148,7 @@ def variant_fancy_padding(knl):
 
         return knl
 
-    parameters_dict = dict(K=K)
+    parameters_dict = {"K": K}
 
     variants = [
             variant_basic,
@@ -232,7 +232,7 @@ def no_test_dg_surface(ctx_factory):
     def variant_basic(knl):
         return knl
 
-    parameters_dict = dict(K=K)
+    parameters_dict = {"K": K}
 
     for variant in [
             variant_basic,
diff --git a/test/test_diff.py b/test/test_diff.py
index 626ddb70e..0f9694cc9 100644
--- a/test/test_diff.py
+++ b/test/test_diff.py
@@ -23,11 +23,11 @@
 import logging
 import sys
 
-import numpy as np  # noqa
+import numpy as np
 import numpy.linalg as la
 
 import pyopencl as cl
-import pyopencl.clrandom  # noqa
+import pyopencl.clrandom
 
 import loopy as lp
 
@@ -69,7 +69,7 @@ def test_diff(ctx_factory):
     from loopy.transform.diff import diff_kernel
     # FIXME Is this the correct interface. Does it make sense to take the entire
     # translation unit?
-    dknl, diff_map = diff_kernel(knl["diff"], "z", "x")
+    dknl, _diff_map = diff_kernel(knl["diff"], "z", "x")
     dknl = knl.with_kernel(dknl)
     dknl = lp.remove_unused_arguments(dknl)
 
@@ -87,12 +87,12 @@ def test_diff(ctx_factory):
     h1 = 1e-4
     h2 = h1 * fac
 
-    evt, (z0,) = knl(queue, x=x, y=y)
-    evt, (z1,) = knl(queue, x=(x + h1*dx), y=y)
-    evt, (z2,) = knl(queue, x=(x + h2*dx), y=y)
+    _evt, (z0,) = knl(queue, x=x, y=y)
+    _evt, (z1,) = knl(queue, x=(x + h1*dx), y=y)
+    _evt, (z2,) = knl(queue, x=(x + h2*dx), y=y)
 
     dknl = lp.set_options(dknl, write_code=True)
-    evt, (df,) = dknl(queue, x=x, y=y)
+    _evt, (df,) = dknl(queue, x=x, y=y)
 
     diff1 = (z1-z0)
     diff2 = (z2-z0)
diff --git a/test/test_domain.py b/test/test_domain.py
index c422e131d..c915498ed 100644
--- a/test/test_domain.py
+++ b/test/test_domain.py
@@ -24,11 +24,11 @@
 import sys
 
 import numpy as np
-import pytest  # noqa
+import pytest
 
 import pyopencl as cl
-import pyopencl.clmath  # noqa
-import pyopencl.clrandom  # noqa
+import pyopencl.clmath
+import pyopencl.clrandom
 
 import loopy as lp
 
@@ -268,7 +268,7 @@ def test_independent_multi_domain(ctx_factory):
     assert knl["loopy_kernel"].parents_per_domain() == 2*[None]
 
     n = 50
-    evt, (a, b) = knl(queue, n=n, out_host=True)
+    _evt, (a, b) = knl(queue, n=n, out_host=True)
 
     assert a.shape == (50,)
     assert b.shape == (50,)
@@ -309,7 +309,7 @@ def test_equality_constraints(ctx_factory):
     # print(knl.domains[0].detect_equalities())
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
-            parameters=dict(n=n), print_ref_code=True)
+            parameters={"n": n}, print_ref_code=True)
 
 
 def test_stride(ctx_factory):
@@ -335,7 +335,7 @@ def test_stride(ctx_factory):
     seq_knl = knl
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
-            parameters=dict(n=n))
+            parameters={"n": n})
 
 
 def test_domain_dependency_via_existentially_quantified_variable(ctx_factory):
@@ -363,7 +363,7 @@ def test_domain_dependency_via_existentially_quantified_variable(ctx_factory):
     seq_knl = knl
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
-            parameters=dict(n=n))
+            parameters={"n": n})
 
 
 def test_triangle_domain():
diff --git a/test/test_einsum.py b/test/test_einsum.py
index f2964a121..65624ba5e 100644
--- a/test/test_einsum.py
+++ b/test/test_einsum.py
@@ -57,7 +57,7 @@ def test_einsum_array_manipulation(ctx_factory, spec):
     arg_names = ("a",)
 
     knl = lp.make_einsum(spec, arg_names)
-    evt, (out,) = knl(queue, a=a)
+    _evt, (out,) = knl(queue, a=a)
     ans = np.einsum(spec, a)
 
     assert np.linalg.norm(out - ans) <= 1e-15
@@ -76,7 +76,7 @@ def test_einsum_array_matvec(ctx_factory, spec):
     arg_names = ("a", "b")
 
     knl = lp.make_einsum(spec, arg_names)
-    evt, (out,) = knl(queue, a=a, b=b)
+    _evt, (out,) = knl(queue, a=a, b=b)
     ans = np.einsum(spec, a, b)
 
     assert np.linalg.norm(out - ans) <= 1e-15
@@ -97,7 +97,7 @@ def test_einsum_array_ops_same_dims(ctx_factory, spec):
     arg_names = ("a", "b")
 
     knl = lp.make_einsum(spec, arg_names)
-    evt, (out,) = knl(queue, a=a, b=b)
+    _evt, (out,) = knl(queue, a=a, b=b)
     ans = np.einsum(spec, a, b)
 
     assert np.linalg.norm(out - ans) <= 1e-15
@@ -118,7 +118,7 @@ def test_einsum_array_ops_diff_dims(ctx_factory, spec):
     arg_names = ("a", "b")
 
     knl = lp.make_einsum(spec, arg_names)
-    evt, (out,) = knl(queue, a=a, b=b)
+    _evt, (out,) = knl(queue, a=a, b=b)
     ans = np.einsum(spec, a, b)
 
     assert np.linalg.norm(out - ans) <= 1e-15
@@ -138,7 +138,7 @@ def test_einsum_array_ops_triple_prod(ctx_factory, spec):
     arg_names = ("a", "b", "c")
 
     knl = lp.make_einsum(spec, arg_names)
-    evt, (out,) = knl(queue, a=a, b=b, c=c)
+    _evt, (out,) = knl(queue, a=a, b=b, c=c)
     ans = np.einsum(spec, a, b, c)
 
     assert np.linalg.norm(out - ans) <= 1e-15
diff --git a/test/test_expression.py b/test/test_expression.py
index 1b973e9a6..aa60f8a53 100644
--- a/test/test_expression.py
+++ b/test/test_expression.py
@@ -27,8 +27,8 @@
 import pytest
 
 import pyopencl as cl
-import pyopencl.clmath  # noqa
-import pyopencl.clrandom  # noqa
+import pyopencl.clmath
+import pyopencl.clrandom
 from pymbolic.mapper.evaluator import EvaluationMapper
 
 import loopy as lp
@@ -371,9 +371,9 @@ def get_numpy_type(x):
         cl_ctx = ctx_factory()
         knl = lp.set_options(knl, write_code=True)
         with cl.CommandQueue(cl_ctx) as queue:
-            evt, lp_values = knl(queue, out_host=True)
+            _evt, lp_values = knl(queue, out_host=True)
     elif type(target) is lp.ExecutableCTarget:
-        evt, lp_values = knl()
+        _evt, lp_values = knl()
     else:
         raise NotImplementedError("unsupported target")
 
@@ -413,7 +413,7 @@ def test_sci_notation_literal(ctx_factory):
 
     set_kernel = lp.set_options(set_kernel, write_code=True)
 
-    evt, (out,) = set_kernel(queue)
+    _evt, (out,) = set_kernel(queue)
 
     assert (np.abs(out.get() - 1e-12) < 1e-20).all()
 
@@ -430,7 +430,7 @@ def test_indexof(ctx_factory):
 
     knl = lp.set_options(knl, write_code=True)
 
-    (evt, (out,)) = knl(queue)
+    (_evt, (out,)) = knl(queue)
     out = out.get()
 
     assert np.array_equal(out.ravel(order="C"), np.arange(25))
@@ -454,7 +454,7 @@ def test_indexof_vec(ctx_factory):
     knl = lp.tag_data_axes(knl, "out", "vec,c,c")
     knl = lp.set_options(knl, write_code=True)
 
-    (evt, (out,)) = knl(queue)
+    (_evt, (_out,)) = knl(queue)
     # out = out.get()
     # assert np.array_equal(out.ravel(order="C"), np.arange(25))
 
@@ -568,9 +568,9 @@ def test_complex_support(ctx_factory, target):
     if target == lp.PyOpenCLTarget:
         cl_ctx = ctx_factory()
         with cl.CommandQueue(cl_ctx) as queue:
-            evt, out = knl(queue, **kwargs)
+            _evt, out = knl(queue, **kwargs)
     elif target == lp.ExecutableCTarget:
-        evt, out = knl(**kwargs)
+        _evt, out = knl(**kwargs)
     else:
         raise NotImplementedError("unsupported target")
 
@@ -623,7 +623,7 @@ def test_bool_type_context(ctx_factory):
             lp.GlobalArg("k", dtype=np.bool_, shape=lp.auto),
         ])
 
-    evt, (out,) = knl(queue)
+    _evt, (out,) = knl(queue)
     assert out.get() == np.logical_and(7.0, 8.0)
 
 
@@ -638,7 +638,7 @@ def test_np_bool_handling(ctx_factory):
         "{:}",
         [lp.Assignment(parse("y"), p.LogicalNot(np.bool_(False)))],
         [lp.GlobalArg("y", dtype=np.bool_, shape=lp.auto)])
-    evt, (out,) = knl(queue)
+    _evt, (out,) = knl(queue)
     assert out.get().item() is True
 
 
@@ -692,10 +692,10 @@ def test_complex_functions_with_real_args(ctx_factory, target):
     if target == lp.PyOpenCLTarget:
         cl_ctx = ctx_factory()
         with cl.CommandQueue(cl_ctx) as queue:
-            evt, out = t_unit(queue, c64=c64, c128=c128, f32=f32, f64=f64)
+            _evt, out = t_unit(queue, c64=c64, c128=c128, f32=f32, f64=f64)
     elif target == lp.ExecutableCTarget:
         t_unit = lp.set_options(t_unit, build_options=["-Werror"])
-        evt, out = t_unit(c64=c64, c128=c128, f32=f32, f64=f64)
+        _evt, out = t_unit(c64=c64, c128=c128, f32=f32, f64=f64)
     else:
         raise NotImplementedError("unsupported target")
 
diff --git a/test/test_fortran.py b/test/test_fortran.py
index 8f1291bba..7a5d73a95 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -28,7 +28,7 @@
 import pytest
 
 import pyopencl as cl
-import pyopencl.clrandom  # noqa
+import pyopencl.clrandom
 
 import loopy as lp
 
@@ -136,7 +136,7 @@ def test_assign_single_precision_scalar(ctx_factory):
     t_unit = lp.parse_fortran(fortran_src)
 
     import re
-    assert re.search("1.1000000[0-9]*f", lp.generate_code_v2(t_unit).device_code())
+    assert re.search(r"1.1000000[0-9]*f", lp.generate_code_v2(t_unit).device_code())
 
     a_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F")
     t_unit(queue, a=a_dev)
@@ -176,7 +176,7 @@ def test_fill(ctx_factory):
 
     ctx = ctx_factory()
 
-    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5, a=5))
+    lp.auto_test_vs_ref(knl, ctx, knl, parameters={"n": 5, "a": 5})
 
 
 def test_fill_const(ctx_factory):
@@ -197,7 +197,7 @@ def test_fill_const(ctx_factory):
 
     ctx = ctx_factory()
 
-    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5, a=5))
+    lp.auto_test_vs_ref(knl, ctx, knl, parameters={"n": 5, "a": 5})
 
 
 def test_asterisk_in_shape(ctx_factory):
@@ -247,7 +247,7 @@ def test_assignment_to_subst(ctx_factory):
     knl = lp.assignment_to_subst(knl, "a", "i")
 
     ctx = ctx_factory()
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5})
 
 
 def test_assignment_to_subst_two_defs(ctx_factory):
@@ -274,7 +274,7 @@ def test_assignment_to_subst_two_defs(ctx_factory):
     knl = lp.assignment_to_subst(knl, "a")
 
     ctx = ctx_factory()
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5})
 
 
 def test_assignment_to_subst_indices(ctx_factory):
@@ -339,7 +339,7 @@ def test_if(ctx_factory):
     knl = lp.assignment_to_subst(knl, "a")
 
     ctx = ctx_factory()
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5})
 
 
 def test_tagged(ctx_factory):
@@ -427,7 +427,8 @@ def test_matmul(ctx_factory, buffer_inames):
     prog = lp.buffer_array(prog, "c", buffer_inames=buffer_inames,
             init_expression="0", store_expression="base+buffer")
 
-    lp.auto_test_vs_ref(ref_prog, ctx, prog, parameters=dict(n=128, m=128, ell=128))
+    lp.auto_test_vs_ref(ref_prog, ctx, prog,
+                        parameters={"n": 128, "m": 128, "ell": 128})
 
 
 @pytest.mark.xfail
@@ -524,7 +525,7 @@ def test_fuse_kernels(ctx_factory):
     assert len(knl["xderiv_and_yderiv"].temporary_variables) == 2
 
     ctx = ctx_factory()
-    lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4))
+    lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters={"nelements": 20, "ndofs": 4})
 
 
 def test_parse_and_fuse_two_kernels():
@@ -610,7 +611,7 @@ def test_precompute_some_exist(ctx_factory):
     ref_knl = knl
 
     ctx = ctx_factory()
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 128, "m": 128, "ell": 128})
 
 
 def test_fortran_subroutines():
@@ -679,7 +680,7 @@ def test_division_in_shapes(ctx_factory):
     print(t_unit)
 
     ctx = ctx_factory()
-    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit, parameters=dict(m=128))
+    lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit, parameters={"m": 128})
 
 
 if __name__ == "__main__":
diff --git a/test/test_fusion.py b/test/test_fusion.py
index 66daf9725..ab3b4e74a 100644
--- a/test/test_fusion.py
+++ b/test/test_fusion.py
@@ -52,7 +52,7 @@ def test_two_kernel_fusion(ctx_factory):
         """
     )
     knl = lp.fuse_kernels([knla, knlb], data_flow=[("out", 0, 1)])
-    evt, (out,) = knl(queue)
+    _evt, (out,) = knl(queue)
     np.testing.assert_allclose(out.get(), np.arange(100, 110))
 
 
@@ -163,7 +163,7 @@ def write_into_mat_prg():
         bidirectional=True,
         force=True
     )
-    evt, result = fused_knl(queue, **kwargs)
+    _evt, result = fused_knl(queue, **kwargs)
     result = result["result"]
     np.testing.assert_allclose(result, answer)
 
diff --git a/test/test_isl.py b/test/test_isl.py
index fc1312f7c..125fae143 100644
--- a/test/test_isl.py
+++ b/test/test_isl.py
@@ -26,7 +26,6 @@
 def test_aff_to_expr():
     s = isl.Space.create_from_names(isl.Context(), ["a", "b"])
     zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(s))
-    one = zero.set_constant_val(1)  # noqa
     a = zero.set_coefficient_val(isl.dim_type.in_, 0, 1)
     b = zero.set_coefficient_val(isl.dim_type.in_, 1, 1)
 
diff --git a/test/test_linalg.py b/test/test_linalg.py
index 99273ae9f..ebd2e96bd 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -644,7 +644,7 @@ def test_fancy_matrix_mul(ctx_factory):
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
             op_count=[2*n**3/1e9], op_label=["GFlops"],
-            parameters=dict(n=n))
+            parameters={"n": n})
 
 
 def test_small_batched_matvec(ctx_factory):
@@ -678,7 +678,7 @@ def test_small_batched_matvec(ctx_factory):
 
     lp.auto_test_vs_ref(seq_knl, ctx, knl,
             op_count=[K*2*Np**2/1e9], op_label=["GFlops"],
-            parameters=dict(K=K))
+            parameters={"K": K})
 
 
 if __name__ == "__main__":
diff --git a/test/test_loopy.py b/test/test_loopy.py
index bfa607328..d9d168181 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -27,9 +27,9 @@
 import pytest
 
 import pyopencl as cl
-import pyopencl.array  # noqa
-import pyopencl.clmath  # noqa
-import pyopencl.clrandom  # noqa
+import pyopencl.array
+import pyopencl.clmath
+import pyopencl.clrandom
 
 import loopy as lp
 
@@ -78,7 +78,7 @@ def test_globals_decl_once_with_multi_subprogram(ctx_factory):
 
     knl = lp.split_iname(knl, "i", 2, outer_tag="g.0", inner_tag="l.0")
     knl = lp.split_iname(knl, "ii", 2, outer_tag="g.0", inner_tag="l.0")
-    evt, (out,) = knl(queue, a=a)
+    _evt, (out,) = knl(queue, a=a)
     assert np.linalg.norm(out-(2*(a+cnst)+cnst)) <= 1e-15
 
 
@@ -175,7 +175,7 @@ def test_sized_and_complex_literals(ctx_factory):
                 ],
             assumptions="n>=1")
 
-    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5))
+    lp.auto_test_vs_ref(knl, ctx, knl, parameters={"n": 5})
 
 
 def test_simple_side_effect():
@@ -202,7 +202,7 @@ def test_owed_barriers():
             target=lp.PyOpenCLTarget()
             )
 
-    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.tag_inames(knl, {"i": "l.0"})
 
     print(knl)
     print(lp.generate_code_v2(knl))
@@ -243,7 +243,7 @@ def test_bare_data_dependency(ctx_factory):
                 ])
 
     n = 20000
-    evt, (a,) = knl(queue, n=n, out_host=True)
+    _evt, (a,) = knl(queue, n=n, out_host=True)
 
     assert a.shape == (n,)
     assert (a == 1).all()
@@ -265,7 +265,7 @@ def test_ilp_write_race_detection_global():
             target=lp.PyOpenCLTarget(),
             name="loopy_kernel")
 
-    knl = lp.tag_inames(knl, dict(j="ilp"))
+    knl = lp.tag_inames(knl, {"j": "ilp"})
 
     knl = lp.preprocess_kernel(knl)
 
@@ -291,7 +291,7 @@ def test_ilp_write_race_avoidance_local():
             target=lp.PyOpenCLTarget(),
             name="loopy_kernel")
 
-    knl = lp.tag_inames(knl, dict(i="l.0", j="ilp"))
+    knl = lp.tag_inames(knl, {"i": "l.0", "j": "ilp"})
 
     knl = lp.preprocess_kernel(knl)
     assert knl["loopy_kernel"].temporary_variables["a"].shape == (16, 17)
@@ -307,7 +307,7 @@ def test_ilp_write_race_avoidance_private():
             target=lp.PyOpenCLTarget(),
             name="loopy_kernel")
 
-    knl = lp.tag_inames(knl, dict(j="ilp"))
+    knl = lp.tag_inames(knl, {"j": "ilp"})
 
     knl = lp.preprocess_kernel(knl)
     assert knl["loopy_kernel"].temporary_variables["a"].shape == (16,)
@@ -409,7 +409,7 @@ def test_unknown_arg_shape():
         target=lp.PyOpenCLTarget(),
         assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0]))
 
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
     print(lp.generate_code_v2(knl).device_code())
 
 # }}}
@@ -539,12 +539,12 @@ def test_dependent_domain_insn_iname_finding():
     assert "isrc_box" in prog["loopy_kernel"].insn_inames("set_strength")
 
     prog = lp.add_dtypes(prog,
-        dict(
-            source_boxes=np.int32,
-            box_source_starts=np.int32,
-            box_source_counts_nonchild=np.int32,
-            strengths=np.float64,
-            nsources=np.int32))
+        {
+            "source_boxes": np.int32,
+            "box_source_starts": np.int32,
+            "box_source_counts_nonchild": np.int32,
+            "strengths": np.float64,
+            "nsources": np.int32})
     print(lp.generate_code_v2(prog).device_code())
 
 
@@ -601,14 +601,14 @@ def test_vector_types(ctx_factory, vec_len):
     ref_knl = knl
 
     knl = lp.tag_array_axes(knl, "out", "c,vec")
-    knl = lp.tag_inames(knl, dict(j="unr"))
+    knl = lp.tag_inames(knl, {"j": "unr"})
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
-            parameters=dict(
-                n=20000
-                ))
+            parameters={
+                "n": 20000
+                })
 
 
 def test_conditional(ctx_factory):
@@ -633,9 +633,9 @@ def test_conditional(ctx_factory):
     ref_knl = knl
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
-            parameters=dict(
-                n=200
-                ))
+            parameters={
+                "n": 200
+                })
 
 
 def test_conditional_two_ways(ctx_factory):
@@ -677,9 +677,9 @@ def test_conditional_two_ways(ctx_factory):
     ref_knl = knl
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
-            parameters=dict(
-                n=200
-                ))
+            parameters={
+                "n": 200
+                })
 
 
 def test_ilp_loop_bound(ctx_factory):
@@ -706,9 +706,9 @@ def test_ilp_loop_bound(ctx_factory):
     knl = lp.split_iname(knl,  "k", 4, inner_tag="ilp")
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
-            parameters=dict(
-                n=200
-                ))
+            parameters={
+                "n": 200
+                })
 
 
 def test_arg_shape_uses_assumptions(ctx_factory):
@@ -771,7 +771,7 @@ def test_multiple_writes_to_local_temporary():
         <> temp[i, 0] = 17
         temp[i, 1] = 15
         """)
-    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.tag_inames(knl, {"i": "l.0"})
     print(lp.generate_code_v2(knl).device_code())
 
 
@@ -788,12 +788,12 @@ def test_make_copy_kernel(ctx_factory):
     cknl1 = lp.fix_parameters(cknl1, n2=3)
 
     cknl1 = lp.set_options(cknl1, write_code=True)
-    evt, a2 = cknl1(queue, input=a1)
+    _evt, a2 = cknl1(queue, input=a1)
 
     cknl2 = lp.make_copy_kernel("c,c,c", intermediate_format)
     cknl2 = lp.fix_parameters(cknl2, n2=3)
 
-    evt, a3 = cknl2(queue, input=a2)
+    _evt, a3 = cknl2(queue, input=a2)
 
     assert (a1 == a3).all()
 
@@ -810,7 +810,7 @@ def test_make_copy_kernel_with_offsets(ctx_factory):
     cknl1 = lp.fix_parameters(cknl1, n0=3)
 
     cknl1 = lp.set_options(cknl1, write_code=True)
-    evt, (a2_dev,) = cknl1(queue, input=a1_dev)
+    _evt, (a2_dev,) = cknl1(queue, input=a1_dev)
 
     assert (a1 == a2_dev.get()).all()
 
@@ -830,14 +830,14 @@ def test_auto_test_can_detect_problems(ctx_factory):
         a[i,i] = 25
         """)
 
-    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(a=np.float32))
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+    ref_knl = lp.add_and_infer_dtypes(ref_knl, {"a": np.float32})
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
 
     from loopy.diagnostic import AutomaticTestFailure
     with pytest.raises(AutomaticTestFailure):
         lp.auto_test_vs_ref(
                 ref_knl, ctx, knl,
-                parameters=dict(n=123))
+                parameters={"n": 123})
 
 
 def test_auto_test_zero_warmup_rounds(ctx_factory):
@@ -849,11 +849,11 @@ def test_auto_test_zero_warmup_rounds(ctx_factory):
         a[i,j] = 25
         """)
 
-    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(a=np.float32))
+    ref_knl = lp.add_and_infer_dtypes(ref_knl, {"a": np.float32})
 
     lp.auto_test_vs_ref(
             ref_knl, ctx, ref_knl,
-            parameters=dict(n=12),
+            parameters={"n": 12},
             warmup_rounds=0)
 
 
@@ -894,7 +894,7 @@ def test_atomic(ctx_factory, dtype):
     ref_knl = knl
     knl = lp.split_iname(knl, "i", 512)
     knl = lp.split_iname(knl, "i_inner", 128, outer_tag="unr", inner_tag="g.0")
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 10000})
 
 
 @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
@@ -1001,7 +1001,7 @@ def test_literal_local_barrier(ctx_factory):
 
     ref_knl = knl
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5})
 
 
 def test_local_barrier_mem_kind():
@@ -1056,7 +1056,7 @@ def test_kernel_splitting(ctx_factory):
     print(cgr.device_code())
     print(cgr.host_code())
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5})
 
 
 def test_kernel_splitting_with_loop(ctx_factory):
@@ -1090,7 +1090,7 @@ def test_kernel_splitting_with_loop(ctx_factory):
     print(cgr.device_code())
     print(cgr.host_code())
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5})
 
 
 def save_and_reload_temporaries_test(queue, prog, out_expect, debug=False):
@@ -1126,7 +1126,7 @@ def test_save_of_private_scalar(ctx_factory, hw_loop, debug=False):
         """, seq_dependencies=True)
 
     if hw_loop:
-        prog = lp.tag_inames(prog, dict(i="g.0"))
+        prog = lp.tag_inames(prog, {"i": "g.0"})
 
     save_and_reload_temporaries_test(queue, prog, np.arange(8), debug)
 
@@ -1167,7 +1167,7 @@ def test_save_of_private_array_in_hw_loop(ctx_factory, debug=False):
         end
         """, seq_dependencies=True)
 
-    knl = lp.tag_inames(knl, dict(i="g.0"))
+    knl = lp.tag_inames(knl, {"i": "g.0"})
     knl = lp.set_temporary_address_space(knl, "t", "private")
 
     save_and_reload_temporaries_test(
@@ -1217,7 +1217,7 @@ def test_save_of_private_multidim_array_in_hw_loop(ctx_factory, debug=False):
         """, seq_dependencies=True)
 
     knl = lp.set_temporary_address_space(knl, "t", "private")
-    knl = lp.tag_inames(knl, dict(i="g.0"))
+    knl = lp.tag_inames(knl, {"i": "g.0"})
 
     result = np.array([np.vstack(8 * (np.arange(8),)) for i in range(8)])
     save_and_reload_temporaries_test(queue, knl, result, debug)
@@ -1250,7 +1250,7 @@ def test_save_of_multiple_private_temporaries(ctx_factory, hw_loop, debug=False)
 
     knl = lp.set_temporary_address_space(knl, "t_arr", "private")
     if hw_loop:
-        knl = lp.tag_inames(knl, dict(i="g.0"))
+        knl = lp.tag_inames(knl, {"i": "g.0"})
 
     result = np.array([1, 10, 10, 10, 10, 10, 10, 10, 10, 9])
 
@@ -1273,7 +1273,7 @@ def test_save_of_local_array(ctx_factory, debug=False):
         """, seq_dependencies=True)
 
     knl = lp.set_temporary_address_space(knl, "t", "local")
-    knl = lp.tag_inames(knl, dict(i="g.0", j="l.0"))
+    knl = lp.tag_inames(knl, {"i": "g.0", "j": "l.0"})
 
     save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
 
@@ -1295,7 +1295,7 @@ def test_save_of_local_array_with_explicit_local_barrier(ctx_factory, debug=Fals
         """, seq_dependencies=True)
 
     knl = lp.set_temporary_address_space(knl, "t", "local")
-    knl = lp.tag_inames(knl, dict(i="g.0", j="l.0"))
+    knl = lp.tag_inames(knl, {"i": "g.0", "j": "l.0"})
 
     save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
 
@@ -1316,7 +1316,7 @@ def test_save_local_multidim_array(ctx_factory, debug=False):
             """, seq_dependencies=True)
 
     knl = lp.set_temporary_address_space(knl, "t_local", "local")
-    knl = lp.tag_inames(knl, dict(j="l.0", i="g.0"))
+    knl = lp.tag_inames(knl, {"j": "l.0", "i": "g.0"})
 
     save_and_reload_temporaries_test(queue, knl, 1, debug)
 
@@ -1336,7 +1336,7 @@ def test_save_with_base_storage(ctx_factory, debug=False):
             "...",
             seq_dependencies=True)
 
-    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.tag_inames(knl, {"i": "l.0"})
     knl = lp.set_temporary_address_space(knl, "a", "local")
     knl = lp.set_temporary_address_space(knl, "b", "local")
 
@@ -1359,7 +1359,7 @@ def test_save_ambiguous_storage_requirements():
             """,
             seq_dependencies=True)
 
-    knl = lp.tag_inames(knl, dict(i="g.0", j="l.0"))
+    knl = lp.tag_inames(knl, {"i": "g.0", "j": "l.0"})
     knl = lp.duplicate_inames(knl, "j", within="writes:out", tags={"j": "l.0"})
     knl = lp.set_temporary_address_space(knl, "a", "local")
 
@@ -1382,7 +1382,7 @@ def test_save_across_inames_with_same_tag(ctx_factory, debug=False):
             "...",
             seq_dependencies=True)
 
-    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.tag_inames(knl, {"i": "l.0"})
     knl = lp.duplicate_inames(knl, "i", within="reads:a", tags={"i": "l.0"})
 
     save_and_reload_temporaries_test(queue, knl, np.arange(10), debug)
@@ -1453,7 +1453,7 @@ def test_global_temporary(ctx_factory):
     print(cgr.device_code())
     # print(cgr.host_code())
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5})
 
 
 def test_assign_to_linear_subscript(ctx_factory):
@@ -1508,7 +1508,7 @@ def test_finite_difference_expr_subst(ctx_factory):
                 ])
 
     fused_knl = lp.set_options(fused_knl, write_code=True)
-    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
+    _evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
 
     fused_knl = lp.assignment_to_subst(fused_knl, "f")
 
@@ -1517,7 +1517,7 @@ def test_finite_difference_expr_subst(ctx_factory):
     # This is the real test here: The automatically generated
     # shape expressions are '2+n' and the ones above are 'n+2'.
     # Is loopy smart enough to understand that these are equal?
-    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
+    _evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))
 
     fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")
 
@@ -1530,7 +1530,7 @@ def test_finite_difference_expr_subst(ctx_factory):
 
     precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
     precomp_knl = lp.set_options(precomp_knl, return_dict=True)
-    evt, _ = precomp_knl(queue, u=u, h=h)
+    _evt, _ = precomp_knl(queue, u=u, h=h)
 
 
 # {{{ call without returned values
@@ -1549,7 +1549,7 @@ def test_call_with_no_returned_value(ctx_factory):
     from library_for_test import NoRetFunction
     knl = lp.register_callable(knl, "f", NoRetFunction("f"))
 
-    evt, _ = knl(queue)
+    _evt, _ = knl(queue)
 
 # }}}
 
@@ -1644,7 +1644,7 @@ def test_sequential_dependencies(ctx_factory):
 
     print(prog["loopy_kernel"].stringify(with_dependencies=True))
 
-    lp.auto_test_vs_ref(prog, ctx, prog, parameters=dict(n=5))
+    lp.auto_test_vs_ref(prog, ctx, prog, parameters={"n": 5})
 
 
 def test_nop(ctx_factory):
@@ -1666,7 +1666,7 @@ def test_nop(ctx_factory):
     knl = lp.fix_parameters(knl, n=15)
     knl = lp.add_and_infer_dtypes(knl, {"z": np.float64})
 
-    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(ntrips=5))
+    lp.auto_test_vs_ref(knl, ctx, knl, parameters={"ntrips": 5})
 
 
 def test_global_barrier(ctx_factory):
@@ -1708,7 +1708,7 @@ def test_global_barrier(ctx_factory):
 
     print(knl)
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(ntrips=5, n=10))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"ntrips": 5, "n": 10})
 
 
 def test_missing_global_barrier():
@@ -1842,7 +1842,7 @@ def test_temp_initializer(ctx_factory, src_order, tmp_order):
     knl = lp.set_options(knl, write_code=True)
     knl = lp.fix_parameters(knl, n=a.shape[0])
 
-    evt, (a2,) = knl(queue, out_host=True)
+    _evt, (a2,) = knl(queue, out_host=True)
 
     assert np.array_equal(a, a2)
 
@@ -1939,7 +1939,7 @@ def test_if_else(ctx_factory):
             """
             )
 
-    evt, (out,) = knl(queue, out_host=True)
+    _evt, (out,) = knl(queue, out_host=True)
 
     out_ref = np.empty(50)
     out_ref[::3] = 15
@@ -1967,7 +1967,7 @@ def test_if_else(ctx_factory):
             """
             )
 
-    evt, (out,) = knl(queue, out_host=True)
+    _evt, (out,) = knl(queue, out_host=True)
 
     out_ref = np.zeros(50)
     out_ref[1::2] = 4
@@ -2000,7 +2000,7 @@ def test_if_else(ctx_factory):
             """
             )
 
-    evt, (out,) = knl(queue, out_host=True)
+    _evt, (out,) = knl(queue, out_host=True)
 
     out_ref = np.zeros((50, 50))
     out_ref[:25, 0::2] = 1
@@ -2036,7 +2036,7 @@ def test_tight_loop_bounds(ctx_factory):
 
     knl = lp.set_options(knl, write_code=True)
 
-    evt, (out,) = knl(queue, out_host=True)
+    _evt, (out,) = knl(queue, out_host=True)
 
     assert (out == np.arange(10)).all()
 
@@ -2247,7 +2247,7 @@ def test_barrier_insertion_near_top_of_loop():
         """,
         seq_dependencies=True)
 
-    prog = lp.tag_inames(prog, dict(i="l.0"))
+    prog = lp.tag_inames(prog, {"i": "l.0"})
     prog = lp.set_temporary_address_space(prog, "a", "local")
     prog = lp.set_temporary_address_space(prog, "b", "local")
     prog = lp.preprocess_kernel(prog)
@@ -2273,7 +2273,7 @@ def test_barrier_insertion_near_bottom_of_loop():
         end
         """,
         seq_dependencies=True)
-    prog = lp.tag_inames(prog, dict(i="l.0"))
+    prog = lp.tag_inames(prog, {"i": "l.0"})
     prog = lp.set_temporary_address_space(prog, "a", "local")
     prog = lp.set_temporary_address_space(prog, "b", "local")
     prog = lp.preprocess_kernel(prog)
@@ -2449,7 +2449,7 @@ def test_inames_conditional_generation(ctx_factory):
             "...",
             seq_dependencies=True)
 
-    knl = lp.tag_inames(knl, dict(i="g.0"))
+    knl = lp.tag_inames(knl, {"i": "g.0"})
 
     with cl.CommandQueue(ctx) as queue:
         knl(queue)
@@ -2465,7 +2465,7 @@ def test_fixed_parameters(ctx_factory):
             <>tmp[i] = i  {id=init}
             tmp[0] = 0  {dep=init}
             """,
-            fixed_parameters=dict(n=1))
+            fixed_parameters={"n": 1})
 
     knl(queue)
 
@@ -2485,7 +2485,7 @@ def test_execution_backend_can_cache_dtypes(ctx_factory):
     queue = cl.CommandQueue(ctx)
 
     knl = lp.make_kernel("{[i]: 0 <= i < 10}", "<>tmp[i] = i")
-    knl = lp.add_dtypes(knl, dict(tmp=int))
+    knl = lp.add_dtypes(knl, {"tmp": int})
 
     knl(queue)
 
@@ -2542,7 +2542,7 @@ def test_relaxed_stride_checks(ctx_factory):
         mat = np.zeros((1, 10), order="F")
         b = np.zeros(10)
 
-        evt, (a,) = knl(queue, A=mat, b=b)
+        _evt, (a,) = knl(queue, A=mat, b=b)
 
         assert a == 0
 
@@ -2720,7 +2720,7 @@ def test_dump_binary(ctx_factory):
     ref_knl = knl
 
     lp.auto_test_vs_ref(
-            ref_knl, ctx, knl, parameters=dict(n=5),
+            ref_knl, ctx, knl, parameters={"n": 5},
             dump_binary=True)
 
 
@@ -2964,7 +2964,7 @@ def test_split_iname_within(ctx_factory):
         x[i, j] = 3 {id=a}
         y[i, j] = 2 * y[i, j] {id=b}
         """,
-        options=dict(write_code=True))
+        options={"write_code": True})
 
     ref_knl = knl
 
@@ -2975,7 +2975,7 @@ def test_split_iname_within(ctx_factory):
                          outer_tag="g.0", inner_tag="l.0",
                          within="id:b")
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 5})
 
 
 @pytest.mark.parametrize("base_type,exp_type", [
@@ -3020,7 +3020,7 @@ def _make_random_np_array(shape, dtype):
 
     knl = lp.add_dtypes(knl, {"base": base_type, "power": exp_type})
 
-    evt, (result,) = knl(queue, base=base, power=power)
+    _evt, (result,) = knl(queue, base=base, power=power)
 
     assert result.dtype == expected_result.dtype
 
@@ -3070,7 +3070,7 @@ def test_scalar_temporary(ctx_factory):
         lp.TemporaryVariable("tmp", address_space=lp.AddressSpace.GLOBAL,
                              shape=lp.auto),
         ...])
-    evt, (out, ) = knl(queue, x=x_in)
+    _evt, (out, ) = knl(queue, x=x_in)
     np.testing.assert_allclose(4*x_in, out.get())
 
 
@@ -3255,7 +3255,7 @@ def test_zero_stride_array(ctx_factory):
         y[i, j] = 1
         """, [lp.GlobalArg("y", shape=(10, 0))])
 
-    evt, (out,) = knl(cq)
+    _evt, (out,) = knl(cq)
     assert out.shape == (10, 0)
 
 
@@ -3271,13 +3271,13 @@ def test_sep_array_ordering(ctx_factory):
         """
         x[k, i] = k
         """,
-        [lp.GlobalArg("x", shape=("noutputs", "m"), dim_tags="sep,C")] + [...],
-        fixed_parameters=dict(noutputs=n),
+        [lp.GlobalArg("x", shape=("noutputs", "m"), dim_tags="sep,C"), ...],
+        fixed_parameters={"noutputs": n},
         )
     knl = lp.tag_inames(knl, "k:unr")
 
     x = [cl.array.empty(cq, (0,), dtype=np.float64) for i in range(n)]
-    evt, out = knl(cq, x=x)
+    _evt, out = knl(cq, x=x)
 
     for i in range(n):
         assert out[i] is x[i], f"failed on input x{i}: {id(out[i])} {id(x[i])}"
@@ -3532,7 +3532,7 @@ def test_type_inference_of_clbls_in_substitutions(ctx_factory):
         y[i] = subst_0(i)
         """)
 
-    evt, (out,) = knl(cq)
+    _evt, (out,) = knl(cq)
     np.testing.assert_allclose(out.get(), np.abs(10.0*(np.arange(10)-5)))
 
 
@@ -3677,8 +3677,8 @@ def test_no_unnecessary_lbarrier(ctx_factory):
         """,
         assumptions="n>=0")
 
-    t_unit = lp.add_dtypes(t_unit, dict(ai=np.float32))
-    t_unit = lp.tag_inames(t_unit, dict(i_inner="l.0", i_outer="g.0"))
+    t_unit = lp.add_dtypes(t_unit, {"ai": np.float32})
+    t_unit = lp.tag_inames(t_unit, {"i_inner": "l.0", "i_outer": "g.0"})
     t_unit = lp.set_temporary_address_space(t_unit, "s_a", "local")
     t_unit = lp.prioritize_loops(t_unit, "i_outer,i_inner")
 
@@ -3704,6 +3704,21 @@ def test_long_kernel():
     lp.get_one_linearized_kernel(t_unit.default_entrypoint, t_unit.callables_table)
 
 
+@pytest.mark.filterwarnings("error:.*:loopy.LoopyWarning")
+def test_loop_imperfect_nest_priorities_in_v2_scheduler():
+    # Reported by Connor Ward. See <https://github.com/inducer/loopy/issues/890>.
+    knl = lp.make_kernel(
+        "{ [i,j,k]: 0 <= i,j,k < 5}",
+        """
+        x[i, j] = i + j
+        y[i, k] = i + k
+        """,
+        loop_priority=frozenset({("i", "j"), ("i", "k")}),
+    )
+
+    lp.generate_code_v2(knl)  # LoopyWarning is an error here (see marker)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_nbody.py b/test/test_nbody.py
index e258d801e..02dcb1743 100644
--- a/test/test_nbody.py
+++ b/test/test_nbody.py
@@ -81,7 +81,7 @@ def variant_gpu(knl):
         knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"],
                 ["x_fetch_j", "x_fetch_k"],
                 fetch_outer_inames="i_outer, j_outer", default_tag=None)
-        knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0"))
+        knl = lp.tag_inames(knl, {"x_fetch_k": "unr", "x_fetch_j": "l.0"})
         knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
         knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"])
         return knl
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 55ff270a2..92020b73a 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -84,7 +84,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
 
     hsv = lp.fix_parameters(hsv, Nq=Nq)
     hsv = lp.prioritize_loops(hsv, "e,k,j,i")
-    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
+    hsv = lp.tag_inames(hsv, {"e": "g.0", "j": "l.1", "i": "l.0"})
     hsv = lp.assume(hsv, "elements >= 1")
 
     hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
@@ -168,7 +168,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
             hsv = lp.rename_iname(hsv, "n", n_iname, within="id:"+reader.id,
                   existing_ok=True)
 
-    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
+    hsv = lp.tag_inames(hsv, {"ii": "l.0", "jj": "l.1"})
     for iname in flux_ilp_inames:
         hsv = lp.tag_inames(hsv, {iname: "ilp"})
 
@@ -193,9 +193,9 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
 
     if opt_level == 4:
         tap_hsv = hsv
-        tap_hsv = lp.tag_inames(tap_hsv, dict(
-              Q_dim_field_inner="unr",
-              Q_dim_field_outer="unr"))
+        tap_hsv = lp.tag_inames(tap_hsv, {
+              "Q_dim_field_inner": "unr",
+              "Q_dim_field_outer": "unr"})
 
     hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
           fetch_bounding_box=True, default_tag="for",
@@ -203,11 +203,11 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
 
     if opt_level == 5:
         tap_hsv = hsv
-        tap_hsv = lp.tag_inames(tap_hsv, dict(
-              rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
-              rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
-              Q_dim_field_inner="unr",
-              Q_dim_field_outer="unr"))
+        tap_hsv = lp.tag_inames(tap_hsv, {
+              "rhsQ_init_field_inner": "unr", "rhsQ_store_field_inner": "unr",
+              "rhsQ_init_field_outer": "unr", "rhsQ_store_field_outer": "unr",
+              "Q_dim_field_inner": "unr",
+              "Q_dim_field_outer": "unr"})
 
     # buffer axes need to be vectorized in order for this to work
     hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
@@ -219,17 +219,17 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
 
     if opt_level == 6:
         tap_hsv = hsv
-        tap_hsv = lp.tag_inames(tap_hsv, dict(
-              rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
-              rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
-              Q_dim_field_inner="unr",
-              Q_dim_field_outer="unr"))
-
-    hsv = lp.tag_inames(hsv, dict(
-          rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
-          rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
-          Q_dim_field_inner="vec",
-          Q_dim_field_outer="unr"))
+        tap_hsv = lp.tag_inames(tap_hsv, {
+              "rhsQ_init_field_inner": "unr", "rhsQ_store_field_inner": "unr",
+              "rhsQ_init_field_outer": "unr", "rhsQ_store_field_outer": "unr",
+              "Q_dim_field_inner": "unr",
+              "Q_dim_field_outer": "unr"})
+
+    hsv = lp.tag_inames(hsv, {
+          "rhsQ_init_field_inner": "vec", "rhsQ_store_field_inner": "vec",
+          "rhsQ_init_field_outer": "unr", "rhsQ_store_field_outer": "unr",
+          "Q_dim_field_inner": "vec",
+          "Q_dim_field_outer": "unr"})
 
     if opt_level == 7:
         tap_hsv = hsv
@@ -266,7 +266,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
     # add a simple transformation for it
     # hsv = hsv.copy(name="horizontalStrongVolumeKernel")
 
-    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300),
+    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters={"elements": 300},
             quiet=True)
 
     elapsed = results["elapsed_wall"]
diff --git a/test/test_reduction.py b/test/test_reduction.py
index 0ca1a2650..125d247d9 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -27,8 +27,8 @@
 import pytest
 
 import pyopencl as cl
-import pyopencl.clmath  # noqa
-import pyopencl.clrandom  # noqa
+import pyopencl.clmath
+import pyopencl.clrandom
 import pyopencl.version
 
 import loopy as lp
@@ -86,7 +86,7 @@ def test_empty_reduction(ctx_factory):
     print(knl)
 
     knl = lp.set_options(knl, write_code=True)
-    evt, (a,) = knl(queue)
+    _evt, (a,) = knl(queue)
 
     assert (a.get() == 0).all()
 
@@ -113,7 +113,7 @@ def test_nested_dependent_reduction(ctx_factory):
 
     n = 330
     ell = np.arange(n, dtype=np.int32)
-    evt, (a,) = knl(queue, ell=ell, n=n, out_host=True)
+    _evt, (a,) = knl(queue, ell=ell, n=n, out_host=True)
 
     tgt_result = (2*ell-1)*2*ell/2
     assert (a == tgt_result).all()
@@ -314,7 +314,7 @@ def test_argmax(ctx_factory):
     knl = lp.set_options(knl, write_code=True, allow_terminal_colors=True)
 
     a = np.random.randn(10000).astype(dtype)
-    evt, (max_idx, max_val) = knl(queue, a=a, out_host=True)
+    _evt, (max_idx, max_val) = knl(queue, a=a, out_host=True)
     assert max_val == np.max(np.abs(a))
     assert max_idx == np.where(np.abs(a) == max_val)[-1]
 
@@ -333,7 +333,7 @@ def test_simul_reduce(ctx_factory):
                 ],
             assumptions="n>=1")
 
-    evt, (a, b) = knl(queue, n=n)
+    _evt, (a, b) = knl(queue, n=n)
 
     ref = sum(i*j for i in range(n) for j in range(n))
     assert a.get() == ref
@@ -358,7 +358,7 @@ def test_reduction_library(ctx_factory, op_name, np_op):
             assumptions="n>=1")
 
     a = np.random.randn(20, 10)
-    evt, (res,) = knl(queue, a=a)
+    _evt, (res,) = knl(queue, a=a)
 
     assert np.allclose(res, np_op(a, axis=1))
 
@@ -395,7 +395,7 @@ def test_double_sum_made_unique(ctx_factory):
     knl = lp.make_reduction_inames_unique(knl)
     print(knl)
 
-    evt, (a, b) = knl(queue, n=n)
+    _evt, (a, b) = knl(queue, n=n)
 
     ref = sum(i*j for i in range(n) for j in range(n))
     assert a.get() == ref
@@ -408,14 +408,14 @@ def test_parallel_multi_output_reduction(ctx_factory):
                 """
                 max_val, max_indices = argmax(i, abs(a[i]), i)
                 """)
-    knl = lp.tag_inames(knl, dict(i="l.0"))
-    knl = lp.add_dtypes(knl, dict(a=np.float64))
+    knl = lp.tag_inames(knl, {"i": "l.0"})
+    knl = lp.add_dtypes(knl, {"a": np.float64})
 
     ctx = ctx_factory()
 
     with cl.CommandQueue(ctx) as queue:
         a = np.random.rand(128)
-        out, (max_index, max_val) = knl(queue, a=a)
+        _out, (max_index, max_val) = knl(queue, a=a)
 
         assert max_val == np.max(a)
         assert max_index == np.argmax(np.abs(a))
@@ -497,7 +497,7 @@ def test_reduction_in_conditional(ctx_factory):
 
     knl = lp.preprocess_program(knl)
 
-    evt, (out,) = knl(cq)
+    _evt, (out,) = knl(cq)
 
     assert (out == 45).all()
 
diff --git a/test/test_scan.py b/test/test_scan.py
index 986a30daa..47c2e04b4 100644
--- a/test/test_scan.py
+++ b/test/test_scan.py
@@ -30,8 +30,8 @@
 import pytest
 
 import pyopencl as cl
-import pyopencl.clmath  # noqa
-import pyopencl.clrandom  # noqa
+import pyopencl.clmath
+import pyopencl.clrandom
 
 import loopy as lp
 
@@ -77,7 +77,7 @@ def test_sequential_scan(ctx_factory, n, stride):
     knl = lp.fix_parameters(knl, n=n)
     knl = lp.realize_reduction(knl, force_scan=True)
 
-    evt, (a,) = knl(queue)
+    _evt, (a,) = knl(queue)
 
     assert (a.get() == np.cumsum(np.arange(stride*n)**2)[::stride]).all()
 
@@ -108,7 +108,7 @@ def test_scan_with_different_lower_bound_from_sweep(
 
     knl = lp.fix_parameters(knl, sweep_lbound=sweep_lbound, scan_lbound=scan_lbound)
     knl = lp.realize_reduction(knl, force_scan=True)
-    evt, (out,) = knl(queue, n=n)
+    _evt, (out,) = knl(queue, n=n)
 
     assert (out.get()
             == np.cumsum(np.arange(scan_lbound, 2*n+scan_lbound)**2)[::2]).all()
@@ -138,7 +138,7 @@ def test_force_outer_iname_for_scan():
         "[n] -> {[i,j,k]: 0<=k<n and 0<=i<=k and 0<=j<=i}",
         "out[i] = product(j, a[j]) {inames=i:k}")
 
-    knl = lp.add_dtypes(knl, dict(a=np.float32))
+    knl = lp.add_dtypes(knl, {"a": np.float32})
 
     # TODO: Maybe this deserves to work?
     with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
@@ -161,7 +161,7 @@ def test_dependent_domain_scan(ctx_factory):
         """
         )
     knl = lp.realize_reduction(knl, force_scan=True)
-    evt, (a,) = knl(queue, n=100)
+    _evt, (a,) = knl(queue, n=100)
 
     assert (a.get() == np.cumsum(np.arange(200)**2)[::2]).all()
 
@@ -185,13 +185,13 @@ def test_nested_scan(ctx_factory, i_tag, j_tag):
         """)
 
     knl = lp.fix_parameters(knl, n=10)
-    knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))
+    knl = lp.tag_inames(knl, {"i": i_tag, "j": j_tag})
 
     knl = lp.realize_reduction(knl, force_scan=True)
 
     print(knl)
 
-    evt, (out,) = knl(queue)
+    _evt, (out,) = knl(queue)
 
     print(out)
 
@@ -222,12 +222,12 @@ def test_local_parallel_scan(ctx_factory, n):
         )
 
     knl = lp.fix_parameters(knl, n=n)
-    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.tag_inames(knl, {"i": "l.0"})
     knl = lp.realize_reduction(knl, force_scan=True)
 
-    knl = lp.add_dtypes(knl, dict(a=int))
+    knl = lp.add_dtypes(knl, {"a": int})
 
-    evt, (a,) = knl(queue, a=np.arange(n))
+    _evt, (a,) = knl(queue, a=np.arange(n))
     assert (a == np.cumsum(np.arange(n)**2)).all()
 
 
@@ -244,11 +244,11 @@ def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
         )
 
     knl = lp.fix_parameters(knl, n=16)
-    knl = lp.tag_inames(knl, dict(i="l.0"))
+    knl = lp.tag_inames(knl, {"i": "l.0"})
     knl = lp.realize_reduction(knl, force_scan=True)
 
-    knl = lp.add_dtypes(knl, dict(a=int))
-    evt, (out,) = knl(queue, a=np.arange(1, 17))
+    knl = lp.add_dtypes(knl, {"a": int})
+    _evt, (out,) = knl(queue, a=np.arange(1, 17))
 
     assert (out == np.cumsum(np.arange(1, 17)**2)).all()
 
@@ -276,12 +276,12 @@ def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag):
         "out[k,i] = k + sum(j, j**2)"
         )
 
-    knl = lp.tag_inames(knl, dict(k="l.0", i=sweep_iname_tag))
+    knl = lp.tag_inames(knl, {"k": "l.0", "i": sweep_iname_tag})
     n = 10
     knl = lp.fix_parameters(knl, n=n)
     knl = lp.realize_reduction(knl, force_scan=True)
 
-    evt, (out,) = knl(queue)
+    _evt, (out,) = knl(queue)
 
     inner = np.cumsum(np.arange(n)**2)
 
@@ -300,9 +300,9 @@ def test_scan_data_types(ctx_factory, dtype):
             assumptions="n>=1")
 
     a = np.random.randn(20).astype(dtype)
-    knl = lp.add_dtypes(knl, dict(a=dtype))
+    knl = lp.add_dtypes(knl, {"a": dtype})
     knl = lp.realize_reduction(knl, force_scan=True)
-    evt, (res,) = knl(queue, a=a)
+    _evt, (res,) = knl(queue, a=a)
 
     assert np.allclose(res, np.cumsum(a))
 
@@ -323,9 +323,9 @@ def test_scan_library(ctx_factory, op_name, np_op):
             assumptions="n>=1")
 
     a = np.random.randn(20)
-    knl = lp.add_dtypes(knl, dict(a=np.float64))
+    knl = lp.add_dtypes(knl, {"a": np.float64})
     knl = lp.realize_reduction(knl, force_scan=True)
-    evt, (res,) = knl(queue, a=a)
+    _evt, (res,) = knl(queue, a=a)
 
     assert np.allclose(res, np.array(
             [np_op(a[:i+1]) for i in range(len(a))]))
@@ -351,12 +351,12 @@ def test_argmax(ctx_factory, i_tag):
             max_vals[i], max_indices[i] = argmax(j, abs(a[j]), j)
             """)
 
-    knl = lp.tag_inames(knl, dict(i=i_tag))
+    knl = lp.tag_inames(knl, {"i": i_tag})
     knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
     knl = lp.realize_reduction(knl, force_scan=True)
 
     a = np.random.randn(n).astype(dtype)
-    evt, (max_indices, max_vals) = knl(queue, a=a, out_host=True)
+    _evt, (max_indices, max_vals) = knl(queue, a=a, out_host=True)
 
     assert (max_vals == [np.max(np.abs(a)[0:i+1]) for i in range(n)]).all()
     assert (max_indices == [np.argmax(np.abs(a[0:i+1])) for i in range(n)]).all()
@@ -402,7 +402,7 @@ def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
 
     arr = np.ones(n, dtype=np.float32)
     segment_boundaries = np.zeros(n, dtype=np.int32)
-    segment_boundaries[(segment_boundaries_indices,)] = 1
+    segment_boundaries[segment_boundaries_indices,] = 1
 
     knl = lp.make_kernel(
         "{[i,j]: 0<=i<n and 0<=j<=i}",
@@ -414,10 +414,10 @@ def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
         ])
 
     knl = lp.fix_parameters(knl, n=n)
-    knl = lp.tag_inames(knl, dict(i=iname_tag))
+    knl = lp.tag_inames(knl, {"i": iname_tag})
     knl = lp.realize_reduction(knl, force_scan=True)
 
-    (evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries)
+    (_evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries)
 
     check_segmented_scan_output(arr, segment_boundaries_indices, out)
 
diff --git a/test/test_sem_reagan.py b/test/test_sem_reagan.py
index 5ca4a08f3..7969795bf 100644
--- a/test/test_sem_reagan.py
+++ b/test/test_sem_reagan.py
@@ -63,7 +63,7 @@ def test_tim2d(ctx_factory):
             [
                 lp.GlobalArg("u", dtype, shape=field_shape, order=order),
                 lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
-                lp.GlobalArg("G", dtype, shape=(3,)+field_shape, order=order),
+                lp.GlobalArg("G", dtype, shape=(3, *field_shape), order=order),
                 # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
                 lp.GlobalArg("D", dtype, shape=(n, n), order=order),
                 # lp.ImageArg("D", dtype, shape=(n, n)),
@@ -78,7 +78,7 @@ def test_tim2d(ctx_factory):
     seq_knl = knl
 
     def variant_orig(knl):
-        knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0"))
+        knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1", "e": "g.0"})
 
         knl = lp.add_prefetch(knl, "D[:,:]", fetch_outer_inames="e",
                 default_tag="l.auto")
@@ -94,8 +94,8 @@ def variant_orig(knl):
         knl = lp.add_prefetch(knl, "G$x[:,e,:,:]", default_tag="l.auto")
         knl = lp.add_prefetch(knl, "G$y[:,e,:,:]", default_tag="l.auto")
 
-        knl = lp.tag_inames(knl, dict(o="unr"))
-        knl = lp.tag_inames(knl, dict(m="unr"))
+        knl = lp.tag_inames(knl, {"o": "unr"})
+        knl = lp.tag_inames(knl, {"m": "unr"})
 
         knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)
         print(knl)
diff --git a/test/test_split_iname_slabs.py b/test/test_split_iname_slabs.py
index a171d526d..4b9e74af5 100644
--- a/test/test_split_iname_slabs.py
+++ b/test/test_split_iname_slabs.py
@@ -22,7 +22,7 @@
 import numpy as np
 import pytest
 
-import pyopencl as cl  # noqa
+import pyopencl as cl
 import pyopencl.array as clarray
 from pyopencl.tools import (
     pytest_generate_tests_for_pyopencl as pytest_generate_tests,  # noqa
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 0547a77f1..b6d95d60e 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -58,8 +58,8 @@ def test_op_counter_basic():
             name="basic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
-                                  dict(a=np.float32, b=np.float32,
-                                       g=np.float64, h=np.float64))
+                                  {"a": np.float32, "b": np.float32,
+                                       "g": np.float64, "h": np.float64})
     op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
                            count_within_subscripts=True)
     n_workgroups = 1
@@ -95,7 +95,7 @@ def test_op_counter_reduction():
             ],
             name="matmul_serial", assumptions="n,m,ell >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32})
     op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
     n_workgroups = 1
     group_size = 1
@@ -131,7 +131,7 @@ def test_op_counter_logic():
             ],
             name="logic", assumptions="n,m,ell >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
+    knl = lp.add_and_infer_dtypes(knl, {"g": np.float32, "h": np.float64})
     op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
     n_workgroups = 1
     group_size = 1
@@ -169,8 +169,8 @@ def test_op_counter_special_ops():
             name="special_ops", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
-                                  dict(a=np.float32, b=np.float32,
-                                       g=np.float64, h=np.float64))
+                                  {"a": np.float32, "b": np.float32,
+                                       "g": np.float64, "h": np.float64})
     op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
                            count_within_subscripts=True)
     n_workgroups = 1
@@ -218,9 +218,9 @@ def test_op_counter_bitwise():
             name="bitwise", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(
-            knl, dict(
-                a=np.int32, b=np.int32,
-                g=np.int64, h=np.int64))
+            knl, {
+                "a": np.int32, "b": np.int32,
+                "g": np.int64, "h": np.int64})
 
     op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
                            count_within_subscripts=False)
@@ -269,7 +269,7 @@ def test_op_counter_triangular_domain():
             name="bitwise", assumptions="n,m >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
-            dict(b=np.float64))
+            {"b": np.float64})
 
     expect_fallback = False
     import islpy as isl
@@ -285,7 +285,7 @@ def test_op_counter_triangular_domain():
                     subgroup_size=SGS,
                     count_redundant_work=True
                     )[lp.Op(np.float64, OpType.MUL, CG.SUBGROUP, "bitwise")]
-    value_dict = dict(m=13, n=200)
+    value_dict = {"m": 13, "n": 200}
     flops = op_map.eval_with_dict(value_dict)
 
     n_workgroups = 1
@@ -312,7 +312,7 @@ def test_mem_access_counter_basic():
             name="basic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
-                    dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+            {"a": np.float32, "b": np.float32, "g": np.float64, "h": np.float64})
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                     subgroup_size=SGS)
@@ -383,7 +383,7 @@ def test_mem_access_counter_reduction():
             ],
             name="matmul", assumptions="n,m,ell >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32})
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                     subgroup_size=SGS)
@@ -446,7 +446,7 @@ def test_mem_access_counter_logic():
             ],
             name="logic", assumptions="n,m,ell >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
+    knl = lp.add_and_infer_dtypes(knl, {"g": np.float32, "h": np.float64})
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                     subgroup_size=SGS)
@@ -490,8 +490,8 @@ def test_mem_access_counter_special_ops():
             ],
             name="special_ops", assumptions="n,m,ell >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
-                                            g=np.float64, h=np.float64))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32,
+                                            "g": np.float64, "h": np.float64})
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                     subgroup_size=SGS)
@@ -573,9 +573,9 @@ def test_mem_access_counter_bitwise():
             name="bitwise", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(
-            knl, dict(
-                a=np.int32, b=np.int32,
-                g=np.int32, h=np.int32))
+            knl, {
+                "a": np.int32, "b": np.int32,
+                "g": np.int32, "h": np.int32})
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
                                     subgroup_size=SGS)
@@ -645,9 +645,9 @@ def test_mem_access_counter_mixed():
             ],
             name="mixed", assumptions="n,m,ell >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl, dict(
-                a=np.float32, b=np.float32, g=np.float64, h=np.float64,
-                x=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {
+                "a": np.float32, "b": np.float32, "g": np.float64, "h": np.float64,
+                "x": np.float32})
 
     group_size_0 = 65
 
@@ -760,14 +760,14 @@ def test_mem_access_counter_nonconsec():
             """
             ],
             name="non_consec", assumptions="n,m,ell >= 1")
-    knl = lp.add_and_infer_dtypes(knl, dict(
-                a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+    knl = lp.add_and_infer_dtypes(knl, {
+                "a": np.float32, "b": np.float32, "g": np.float64, "h": np.float64})
     lsize0 = 16
     knl = lp.split_iname(knl, "i", lsize0)
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=SGS)  # noqa
+                                    subgroup_size=SGS)
     n = 512
     m = 256
     ell = 128
@@ -881,8 +881,8 @@ def test_mem_access_counter_consec():
             """
             ],
             name="consec", assumptions="n,m,ell >= 1")
-    knl = lp.add_and_infer_dtypes(knl, dict(
-                a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+    knl = lp.add_and_infer_dtypes(knl, {
+                "a": np.float32, "b": np.float32, "g": np.float64, "h": np.float64})
     knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
@@ -1010,8 +1010,8 @@ def test_barrier_counter_nobarriers():
             ],
             name="basic", assumptions="n,m,ell >= 1")
 
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
-                                            g=np.float64, h=np.float64))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32,
+                                            "g": np.float64, "h": np.float64})
     sync_map = lp.get_synchronization_map(knl)
     n = 512
     m = 256
@@ -1036,7 +1036,7 @@ def test_barrier_counter_barriers():
             ],
             name="weird2",
             )
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.int32})
     knl = lp.split_iname(knl, "k", 128, inner_tag="l.0")
     sync_map = lp.get_synchronization_map(knl)
     print(f"{sync_map=}")
@@ -1073,7 +1073,7 @@ def test_all_counters_parallel_matmul():
                 "c[i, j] = sum(k, a[i, k]*b[k, j])"
             ],
             name="matmul", assumptions="n,m,ell >= 1")
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32})
     knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
     knl = lp.split_iname(knl, "k", bsize)
@@ -1204,7 +1204,7 @@ def test_floor_div_coefficient_collector():
         name="local",
         lang_version=(2018, 2))
 
-    knl = lp.add_and_infer_dtypes(knl, dict(out0=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"out0": np.float32})
     knl = lp.tag_inames(knl, "i_outer:g.1,i_inner:l.1,j_outer:g.0,j_inner:l.0")
 
     n = 512
@@ -1231,7 +1231,7 @@ def test_mem_access_tagged_variables():
                 "c$mmresult[i, j] = sum(k, a$mmaload[i, k]*b$mmbload[k, j])"
             ],
             name="matmul", assumptions="n,m,ell >= 1")
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32})
     knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
     knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
     knl = lp.split_iname(knl, "k", bsize)
@@ -1295,7 +1295,7 @@ def test_gather_access_footprint():
                 "c[i, j] = sum(k, a[i, k]*b[k, j]) + a[i,j]"
             ],
             name="matmul", assumptions="n >= 1")
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "b": np.float32})
 
     from loopy.statistics import count, gather_access_footprints
     fp = gather_access_footprints(knl)
@@ -1309,7 +1309,7 @@ def test_gather_access_footprint_2():
             "{[i]: 0<=i<n}",
             "c[2*i] = a[i]",
             name="matmul", assumptions="n >= 1")
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
 
     from loopy.statistics import count, gather_access_footprints
     fp = gather_access_footprints(knl)
@@ -1333,7 +1333,7 @@ def test_summations_and_filters():
             name="basic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl,
-                    dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
+            {"a": np.float32, "b": np.float32, "g": np.float64, "h": np.float64})
 
     n = 512
     m = 256
@@ -1429,14 +1429,14 @@ def func_filter(key):
 
 
 def test_strided_footprint():
-    param_dict = dict(n=2**20)
+    param_dict = {"n": 2**20}
     knl = lp.make_kernel(
         "[n] -> {[i]: 0<=i<n}",
         [
             "z[i] = x[3*i]"
         ], name="s3")
 
-    knl = lp.add_and_infer_dtypes(knl, dict(x=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"x": np.float32})
 
     unr = 4
     bx = 256
diff --git a/test/test_target.py b/test/test_target.py
index 08bf286cd..a3c4356b0 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -218,7 +218,7 @@ def test_random123(ctx_factory, tp):
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
     knl = lp.set_options(knl, write_code=True)
 
-    evt, (out,) = knl(queue, n=n)
+    _evt, (out,) = knl(queue, n=n)
 
     out = out.get()
     assert (out < 1).all()
@@ -236,7 +236,7 @@ def test_tuple(ctx_factory):
             a, b = make_tuple(1, 2.)
             """)
 
-    evt, (a, b) = knl(queue)
+    _evt, (a, b) = knl(queue)
 
     assert a.get() == 1
     assert b.get() == 2.
@@ -256,7 +256,7 @@ def test_clamp(ctx_factory):
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
     knl = lp.set_options(knl, write_code=True)
 
-    evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15))
+    _evt, (_out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15))
 
 
 def test_sized_integer_c_codegen(ctx_factory):
@@ -272,7 +272,7 @@ def test_sized_integer_c_codegen(ctx_factory):
     knl = lp.set_options(knl, write_code=True)
     n = 40
 
-    evt, (a,) = knl(queue, n=n)
+    _evt, (a,) = knl(queue, n=n)
 
     a_ref = 1 << np.arange(n, dtype=np.int64)
 
@@ -316,7 +316,7 @@ def test_ispc_streaming_stores():
 
     knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(vars, stream_dtype))
 
-    knl = lp.set_argument_order(knl, vars + ["n"])
+    knl = lp.set_argument_order(knl, [*vars, "n"])
 
     lp.generate_code_v2(knl).all_code()
     assert "streaming_store(" in lp.generate_code_v2(knl).all_code()
@@ -348,7 +348,7 @@ def test_pyopencl_execution_numpy_handling(ctx_factory):
 
     y = np.array([3.])
     x = np.array([4.])
-    evt, out = knl(queue, y=y, x=x)
+    _evt, out = knl(queue, y=y, x=x)
     assert out[0] is x
     assert x[0] == 7.
 
@@ -357,7 +357,7 @@ def test_pyopencl_execution_numpy_handling(ctx_factory):
     import pyopencl.array as cla
     y = cla.zeros(queue, shape=(1), dtype="float64") + 3.
     x = np.array([4.])
-    evt, out = knl(queue, y=y, x=x)
+    _evt, out = knl(queue, y=y, x=x)
     assert out[0] is x
     assert x[0] == 7.
 
@@ -366,7 +366,7 @@ def test_pyopencl_execution_numpy_handling(ctx_factory):
 
     y = np.array([3.])
     x = np.array([4.])
-    evt, out = knl(queue, y=y, x=x)
+    _evt, out = knl(queue, y=y, x=x)
     assert out[0] is x
     assert x[0] == 5.
 
@@ -380,7 +380,7 @@ def test_opencl_support_for_bool(ctx_factory):
         [lp.GlobalArg("y", dtype=np.bool_, shape=lp.auto)])
 
     cl_ctx = ctx_factory()
-    evt, (out, ) = knl(cl.CommandQueue(cl_ctx))
+    _evt, (out, ) = knl(cl.CommandQueue(cl_ctx))
     out = out.get()
 
     np.testing.assert_equal(out, np.tile(np.array([0, 1], dtype=np.bool_), 5))
@@ -405,16 +405,16 @@ def test_nan_support(ctx_factory, target):
          lp.Assignment(parse("g"), NaN(np.complex64)),
          lp.Assignment(parse("h"), NaN(np.complex128)),
          ],
-        [lp.GlobalArg("a", is_input=False, shape=tuple()), ...],
+        [lp.GlobalArg("a", is_input=False, shape=()), ...],
         seq_dependencies=True, target=target())
 
     knl = lp.set_options(knl, return_dict=True)
 
     if target == lp.PyOpenCLTarget:
-        evt, out_dict = knl(queue)
+        _evt, out_dict = knl(queue)
         out_dict = {k: v.get() for k, v in out_dict.items()}
     elif target == lp.ExecutableCTarget:
-        evt, out_dict = knl()
+        _evt, out_dict = knl()
     else:
         raise NotImplementedError("unsupported target")
 
@@ -451,9 +451,9 @@ def test_emits_ternary_operators_correctly(ctx_factory, target):
     knl = lp.set_options(knl, return_dict=True)
 
     if target == lp.PyOpenCLTarget:
-        evt, out_dict = knl(queue)
+        _evt, out_dict = knl(queue)
     elif target == lp.ExecutableCTarget:
-        evt, out_dict = knl()
+        _evt, out_dict = knl()
     else:
         raise NotImplementedError("unsupported target")
 
@@ -479,7 +479,7 @@ def test_scalar_array_take_offset(ctx_factory):
     x_in_base = cla.arange(cq, 42, dtype=np.int32)
     x_in = x_in_base[13]
 
-    evt, (out,) = knl(cq, x=x_in)
+    _evt, (out,) = knl(cq, x=x_in)
     np.testing.assert_allclose(out.get(), 1729)
 
 
@@ -630,7 +630,7 @@ def test_glibc_bessel_functions(dtype):
         second_kind_bessel[i] = bessel_yn(n, x[i])
         """, target=lp.ExecutableCWithGNULibcTarget(compiler))
 
-    if knl.target.compiler.toolchain.cc not in ["gcc", "g++"]:  # pylint: disable=no-member  # noqa: E501
+    if knl.target.compiler.toolchain.cc not in ["gcc", "g++"]:  # pylint: disable=no-member
         pytest.skip("GNU-libc not found.")
 
     knl = lp.fix_parameters(knl, n=2)
@@ -772,7 +772,7 @@ def test_passing_bajillions_of_svm_args(ctx_factory, with_gbarrier):
                 cl.array.zeros(queue, 20, np.float32, allocator=alloc)
                 + np.float32(iargset))
 
-    evt, res = knl(queue, **args, allocator=alloc)
+    _evt, res = knl(queue, **args, allocator=alloc)
 
     for iargset in range(nargsets):
         assert (res[f"c{iargset}"].get() == iargset * multiplier + iargset).all()
diff --git a/test/test_transform.py b/test/test_transform.py
index 11ec37159..3e3aabf14 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -27,8 +27,8 @@
 import pytest
 
 import pyopencl as cl
-import pyopencl.clmath  # noqa
-import pyopencl.clrandom  # noqa
+import pyopencl.clmath
+import pyopencl.clrandom
 from pytools.tag import Tag
 
 import loopy as lp
@@ -111,14 +111,14 @@ def test_collect_common_factors(ctx_factory):
             out[i] = out_tmp {dep=out_up1:out_up2}
             """)
     knl = lp.add_and_infer_dtypes(knl,
-            dict(a=np.float32, alpha=np.float32, b1=np.float32, b2=np.float32))
+            {"a": np.float32, "alpha": np.float32, "b1": np.float32, "b2": np.float32})
 
     ref_knl = knl
 
     knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0")
     knl = lp.collect_common_factors_on_increment(knl, "out_tmp")
 
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=13))
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 13})
 
 
 def test_to_batched(ctx_factory):
@@ -128,25 +128,25 @@ def test_to_batched(ctx_factory):
     knl = lp.make_kernel(
          """ { [i,j]: 0<=i,j<n } """,
          """ out[i] = sum(j, a[i,j]*x[j])""")
-    knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
-                                            x=np.float32,
-                                            a=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"out": np.float32,
+                                            "x": np.float32,
+                                            "a": np.float32})
 
     bknl = lp.to_batched(knl, "nbatches", "out,x")
 
     ref_knl = lp.make_kernel(
          """ { [i,j,k]: 0<=i,j<n and 0<=k<nbatches} """,
          """out[k, i] = sum(j, a[i,j]*x[k, j])""")
-    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
-                                                    x=np.float32,
-                                                    a=np.float32))
+    ref_knl = lp.add_and_infer_dtypes(ref_knl, {"out": np.float32,
+                                                    "x": np.float32,
+                                                    "a": np.float32})
 
     a = np.random.randn(5, 5).astype(np.float32)
     x = np.random.randn(7, 5).astype(np.float32)
 
     # Running both the kernels
-    evt, (out1, ) = bknl(queue, a=a, x=x, n=5, nbatches=7)
-    evt, (out2, ) = ref_knl(queue, a=a, x=x, n=5, nbatches=7)
+    _evt, (out1, ) = bknl(queue, a=a, x=x, n=5, nbatches=7)
+    _evt, (out2, ) = ref_knl(queue, a=a, x=x, n=5, nbatches=7)
 
     # checking that the outputs are same
     assert np.linalg.norm(out1-out2) < 1e-15
@@ -164,15 +164,15 @@ def test_to_batched_temp(ctx_factory):
              dtype=np.float32,
              shape=(),
              address_space=lp.AddressSpace.PRIVATE), "..."])
-    knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
-                                            x=np.float32,
-                                            a=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"out": np.float32,
+                                            "x": np.float32,
+                                            "a": np.float32})
     ref_knl = lp.make_kernel(
          """ { [i,j]: 0<=i,j<n } """,
          """out[i] = sum(j, 2.0*a[i,j]*x[j])""")
-    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
-                                                    x=np.float32,
-                                                    a=np.float32))
+    ref_knl = lp.add_and_infer_dtypes(ref_knl, {"out": np.float32,
+                                                    "x": np.float32,
+                                                    "a": np.float32})
 
     bknl = lp.to_batched(knl, "nbatches", "out,x")
     bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")
@@ -186,7 +186,7 @@ def test_to_batched_temp(ctx_factory):
     # Checking that the program compiles and the logic is correct
     lp.auto_test_vs_ref(
             bref_knl, ctx, bknl,
-            parameters=dict(a=a, x=x, n=5, nbatches=7))
+            parameters={"a": a, "x": x, "n": 5, "nbatches": 7})
 
 
 def test_add_barrier(ctx_factory):
@@ -211,7 +211,7 @@ def test_add_barrier(ctx_factory):
     knl = lp.split_iname(knl, "ii", 2, outer_tag="g.0", inner_tag="l.0")
     knl = lp.split_iname(knl, "jj", 2, outer_tag="g.1", inner_tag="l.1")
 
-    evt, (out,) = knl(queue, a=a)
+    _evt, (out,) = knl(queue, a=a)
     assert (np.linalg.norm(out-2*a.T) < 1e-16)
 
 
@@ -225,7 +225,7 @@ def test_rename_argument(ctx_factory):
 
     kernel = lp.rename_argument(kernel, "a", "b")
 
-    evt, (out,) = kernel(queue, b=np.float32(12), n=20)
+    _evt, (out,) = kernel(queue, b=np.float32(12), n=20)
 
     assert (np.abs(out.get() - 14) < 1e-8).all()
 
@@ -277,7 +277,7 @@ def test_alias_temporaries(ctx_factory):
     knl = lp.allocate_temporaries_for_base_storage(knl)
     lp.auto_test_vs_ref(
             ref_knl, ctx, knl,
-            parameters=dict(n=30))
+            parameters={"n": 30})
 
 
 def test_vectorize(ctx_factory):
@@ -289,10 +289,10 @@ def test_vectorize(ctx_factory):
         <> temp = 2*b[i]
         a[i] = temp
         """)
-    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
+    knl = lp.add_and_infer_dtypes(knl, {"b": np.float32})
     knl = lp.set_array_axis_names(knl, "a,b", "i")
     knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
-            split_kwargs=dict(slabs=(0, 1)))
+            split_kwargs={"slabs": (0, 1)})
 
     knl = lp.tag_array_axes(knl, "a,b", "c,vec")
     ref_knl = knl
@@ -301,11 +301,11 @@ def test_vectorize(ctx_factory):
     knl = lp.tag_inames(knl, {"i_inner": "vec"})
 
     knl = lp.preprocess_kernel(knl)
-    code, inf = lp.generate_code(knl)
+    _code, _inf = lp.generate_code(knl)
 
     lp.auto_test_vs_ref(
             ref_knl, ctx, knl,
-            parameters=dict(n=30))
+            parameters={"n": 30})
 
 
 def test_extract_subst(ctx_factory):
@@ -363,10 +363,10 @@ def test_tag_data_axes(ctx_factory):
         lp.tag_array_axes(knl, "out", "N1,N0,c")
 
     knl = lp.tag_array_axes(knl, "out", "N1,N0,N2")
-    knl = lp.tag_inames(knl, dict(j="g.0", i="g.1"))
+    knl = lp.tag_inames(knl, {"j": "g.0", "i": "g.1"})
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
-            parameters=dict(n=20))
+            parameters={"n": 20})
 
 
 def test_set_arg_order():
@@ -393,7 +393,7 @@ class BarTag(UniqueTag):
 
     knl = t_unit.default_entrypoint
 
-    tags = knl.iname_tags("i")
+    knl.iname_tags("i")
     assert not knl.iname_tags_of_type("i", FooTag)
     assert not knl.iname_tags_of_type("i", BarTag)
 
@@ -428,11 +428,11 @@ def test_precompute_confusing_subst_arguments(ctx_factory):
         b[i,j] = D(j)
         """, name="precomputer")
 
-    prog = lp.add_and_infer_dtypes(prog, dict(a=np.float32))
+    prog = lp.add_and_infer_dtypes(prog, {"a": np.float32})
 
     ref_prog = prog
 
-    prog = lp.tag_inames(prog, dict(j="g.1"))
+    prog = lp.tag_inames(prog, {"j": "g.1"})
     prog = lp.split_iname(prog, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
     from loopy.symbolic import get_dependencies
@@ -444,7 +444,7 @@ def test_precompute_confusing_subst_arguments(ctx_factory):
 
     lp.auto_test_vs_ref(
             ref_prog, ctx, prog,
-            parameters=dict(n=12345))
+            parameters={"n": 12345})
 
 
 def test_precompute_nested_subst(ctx_factory):
@@ -458,7 +458,7 @@ def test_precompute_nested_subst(ctx_factory):
         b[i] = D
         """, name="precomputer")
 
-    prog = lp.add_and_infer_dtypes(prog, dict(a=np.float32))
+    prog = lp.add_and_infer_dtypes(prog, {"a": np.float32})
 
     ref_prog = prog
 
@@ -482,7 +482,7 @@ def test_precompute_nested_subst(ctx_factory):
 
     lp.auto_test_vs_ref(
             ref_prog, ctx, prog,
-            parameters=dict(n=12345))
+            parameters={"n": 12345})
 
 
 def test_precompute_with_preexisting_inames(ctx_factory):
@@ -517,7 +517,7 @@ def test_precompute_with_preexisting_inames(ctx_factory):
 
     lp.auto_test_vs_ref(
             ref_knl, ctx, knl,
-            parameters=dict(E=200))
+            parameters={"E": 200})
 
 
 def test_precompute_with_preexisting_inames_fail():
@@ -798,13 +798,7 @@ def test_map_domain_transform_map_validity_and_errors(ctx_factory):
     # Prioritize loops
     desired_prio = "x, t_outer, t_inner, z, y_new"
 
-    # Use constrain_loop_nesting if it's available
-    cln_attr = getattr(lp, "constrain_loop_nesting", None)
-    if cln_attr is not None:
-        knl_map_dom = lp.constrain_loop_nesting(  # noqa pylint:disable=no-member
-            knl_map_dom, desired_prio)
-    else:
-        knl_map_dom = lp.prioritize_loops(knl_map_dom, desired_prio)
+    knl_map_dom = lp.prioritize_loops(knl_map_dom, desired_prio)
 
     # Get a linearization
     proc_knl_map_dom = lp.preprocess_kernel(knl_map_dom)
@@ -818,11 +812,7 @@ def test_map_domain_transform_map_validity_and_errors(ctx_factory):
     knl_split_iname = ref_knl
     knl_split_iname = lp.split_iname(knl_split_iname, "t", 16)
     knl_split_iname = lp.rename_iname(knl_split_iname, "y", "y_new")
-    try:
-        # Use constrain_loop_nesting if it's available
-        knl_split_iname = lp.constrain_loop_nesting(knl_split_iname, desired_prio)
-    except AttributeError:
-        knl_split_iname = lp.prioritize_loops(knl_split_iname, desired_prio)
+    knl_split_iname = lp.prioritize_loops(knl_split_iname, desired_prio)
     proc_knl_split_iname = lp.preprocess_kernel(knl_split_iname)
     lin_knl_split_iname = lp.get_one_linearized_kernel(
         proc_knl_split_iname["loopy_kernel"], proc_knl_split_iname.callables_table)
@@ -1164,7 +1154,7 @@ def test_rename_argument_with_auto_stride(ctx_factory):
     assert code_str.find("double const *__restrict__ x_new,") != -1
     assert code_str.find("double const *__restrict__ x,") == -1
 
-    evt, (out, ) = knl(queue, x_new=np.random.rand(10))
+    _evt, (_out, ) = knl(queue, x_new=np.random.rand(10))
 
 
 def test_rename_argument_with_assumptions():