diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6c47aa63f..2b4572611 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -141,7 +141,7 @@ jobs:
     downstream_tests:
         strategy:
             matrix:
-                downstream_project: [meshmode, grudge, pytential, pytato]
+                downstream_project: [arraycontext, meshmode, grudge, pytential, pytato]
             fail-fast: false
         name: Tests for downstream project ${{ matrix.downstream_project }}
         runs-on: ubuntu-latest
@@ -190,4 +190,14 @@ jobs:
 
                 pytest --tb=native -rsxw --durations=10 -m 'not parallel' tests/multigrid/
 
+    validate_cff:
+        name: Validate CITATION.cff
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v3
+        -   uses: actions/setup-python@v4
+        -   run: |
+                pip install cffconvert
+                cffconvert -i CITATION.cff --validate
+
 # vim: sw=4
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d94a9f484..3b9817a2b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,9 +1,4 @@
-stages:
-  - test
-  - deploy
-
 Pytest POCL:
-  stage: test
   script:
   - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako"
@@ -20,7 +15,6 @@ Pytest POCL:
       junit: test/pytest.xml
 
 Pytest Nvidia Titan V:
-  stage: test
   script:
   - export PYOPENCL_TEST=nvi:titan
   - export EXTRA_INSTALL="pybind11 numpy mako"
@@ -38,7 +32,6 @@ Pytest Nvidia Titan V:
       junit: test/pytest.xml
 
 Pytest POCL without arg check:
-  stage: test
   script:
   - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako"
@@ -56,7 +49,6 @@ Pytest POCL without arg check:
       junit: test/pytest.xml
 
 Pytest Intel:
-  stage: test
   script:
   - export PYOPENCL_TEST=intel
   - export EXTRA_INSTALL="pybind11 numpy mako"
@@ -76,7 +68,6 @@ Pytest Intel:
 
 
 Pytest POCL Twice With Cache:
-  stage: test
   script: |
     export PYOPENCL_TEST=portable:pthread
     export EXTRA_INSTALL="pybind11 numpy mako"
@@ -109,7 +100,6 @@ Pytest POCL Twice With Cache:
 #   - tags
 
 Pytest POCL Examples:
-  stage: test
   script: |
     export PYOPENCL_TEST=portable:pthread
     export EXTRA_INSTALL="pybind11 numpy mako"
@@ -134,7 +124,6 @@ Pytest POCL Examples:
   - tags
 
 Pylint:
-  stage: test
   script:
   # Needed to avoid name shadowing issues when running from source directory.
   - PROJECT_INSTALL_FLAGS="--editable"
@@ -147,7 +136,6 @@ Pylint:
   - tags
 
 Documentation:
-  stage: deploy
   script: |
     EXTRA_INSTALL="pybind11 numpy"
     curl -L -O https://tiker.net/ci-support-v0
@@ -160,7 +148,6 @@ Documentation:
   - python3
 
 Flake8:
-  stage: test
   script:
   - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh
   - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples
@@ -182,26 +169,10 @@ Mypy:
   except:
   - tags
 
-Benchmarks:
-  stage: test
-  script:
-  - CONDA_ENVIRONMENT=.test-conda-env-py3.yml
-  - PROJECT=loopy
-  - PYOPENCL_TEST=portable:pthread
-  - export LOOPY_NO_CACHE=1
-  - export ASV_FACTOR=1.5
-  - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-benchmark-py-project.sh
-  - ". ./build-and-benchmark-py-project.sh"
-  tags:
-  - linux
-  - benchmark
-  except:
-  - tags
-
 Downstream:
   parallel:
     matrix:
-    - DOWNSTREAM_PROJECT: [meshmode, grudge, pytential, pytato]
+    - DOWNSTREAM_PROJECT: [arraycontext, meshmode, grudge, pytential, pytato]
   tags:
   - large-node
   - "docker-runner"
diff --git a/CITATION.cff b/CITATION.cff
new file mode 100644
index 000000000..31bef5e73
--- /dev/null
+++ b/CITATION.cff
@@ -0,0 +1,103 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+
+# major contributors
+
+- family-names: "Kloeckner"
+  given-names: "Andreas"
+  orcid: "https://orcid.org/0000-0003-1228-519X"
+- family-names: Kulkarni
+  given-names: Kaushik
+  email: kaushikcfd@gmail.com
+- family-names: Kempf
+  given-names: Dominic
+  email: dominic.r.kempf@gmail.com
+- family-names: Wala
+  given-names: Matt
+  email: wala1@illinois.edu
+- family-names: Curtis
+  given-names: Nick
+  email: arghdos@gmail.com
+- family-names: Stevens
+  given-names: James
+  email: jdsteve2@illinois.edu
+- family-names: Fernando
+  given-names: Isuru
+  email: isuruf@gmail.com
+
+# smaller fixes
+
+- family-names: Mitchell
+  given-names: Lawrence
+  email: lawrence@wence.uk
+- family-names: Alvey-Blanco
+  given-names: Addison J.
+  email: aalveyblanco@gmail.com
+- family-names: Fikl
+  given-names: Alexandru
+  email: alexfikl@gmail.com
+- family-names: Malone
+  given-names: Chris
+  email: chris.m.malone@gmail.com
+- family-names: Ward
+  given-names: Connor
+  email: c.ward20@imperial.ac.uk
+- family-names: Wilcox
+  given-names: Lucas C.
+  email: lucas@swirlee.com
+- family-names: Koch
+  given-names: Marcel
+  email: marcel.koch@uni-muenster.de
+- family-names: Woodman
+  given-names: Marmaduke
+  email: marmaduke.woodman@univ-amu.fr
+- family-names: Smith
+  given-names: Matthew
+  email: mjsmith6@illinois.edu
+- family-names: Diener
+  given-names: Matthias
+  email: mdiener@illinois.edu
+- family-names: Christensen
+  given-names: Nicholas
+  email: njchris2@illinois.edu
+- family-names: Nytko
+  given-names: Nicolas
+  email: nnytko2@illinois.edu
+- family-names: Kirby
+  given-names: Robert C.
+  email: Robert_Kirby@baylor.edu
+- family-names: Hegmann
+  given-names: Sebastian
+  email: shegmann@nina.iwr.uni-heidelberg.de
+- family-names: Vorderwuelbecke
+  given-names: Sophia
+  email: sv2518@ic.ac.uk
+- family-names: Ratnayaka
+  given-names: Thilina
+  email: thilinarmtb@gmail.com
+- family-names: Gibson
+  given-names: Thomas
+  email: gibsonthomas1120@hotmail.com
+- family-names: Sun
+  given-names: Tianjiao
+  email: tj-sun@tianjiaos-air.home
+- family-names: Smith
+  given-names: Timothy A.
+  email: tasmith4@illinois.edu
+- family-names: Warburton
+  given-names: Tim
+  email: timwar@caam.rice.edu
+- family-names: Wei
+  given-names: Xiaoyu
+  email: wxy0516@gmail.com
+- family-names: Weiner
+  given-names: Zach
+  email: zachjweiner@gmail.com
+
+title: "Loopy"
+version: "2024.1"
+date-released: 2024-02-16
+url: "https://github.com/inducer/loopy"
+doi: 10.5281/zenodo.10672275
+license: MIT
diff --git a/MANIFEST.in b/MANIFEST.in
index 293d43ffc..a87cfef7d 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -19,4 +19,5 @@ include configure.py
 include Makefile.in
 include README.rst
 include LICENSE
+include CITATION.cff
 include requirements*.txt
diff --git a/README.rst b/README.rst
index dd9c8cf76..1ef7773db 100644
--- a/README.rst
+++ b/README.rst
@@ -10,6 +10,9 @@ Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code
 .. image:: https://badge.fury.io/py/loopy.png
     :alt: Python Package Index Release Page
     :target: https://pypi.org/project/loopy/
+.. image:: https://zenodo.org/badge/20281732.svg
+    :alt: Zenodo DOI for latest release
+    :target: https://zenodo.org/doi/10.5281/zenodo.10672274
 
 Loopy lets you easily generate the tedious, complicated code that is necessary
 to get good performance out of GPUs and multi-core CPUs.
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/benchmarks/run_sumpy_kernels.py b/benchmarks/run_sumpy_kernels.py
deleted file mode 100644
index 72c61a539..000000000
--- a/benchmarks/run_sumpy_kernels.py
+++ /dev/null
@@ -1,130 +0,0 @@
-import loopy as lp
-import numpy as np
-import pyopencl as cl
-import logging
-from dataclasses import dataclass
-import time
-
-logger = logging.getLogger(__name__)
-
-from pyopencl.tools import (  # noqa
-    pytest_generate_tests_for_pyopencl as pytest_generate_tests,
-)
-
-
-def _sumpy_kernel_init(param):
-    name, dim, order = param.name, param.dim, param.order
-    # TODO: add other kernels
-    assert name == "m2l"
-    from sumpy.expansion.multipole import (
-        LinearPDEConformingVolumeTaylorMultipoleExpansion,
-    )
-    from sumpy.expansion.local import LinearPDEConformingVolumeTaylorLocalExpansion
-    from sumpy.kernel import LaplaceKernel
-    from sumpy import E2EFromCSR
-
-    ctx = cl.create_some_context()
-    np.random.seed(17)
-
-    knl = LaplaceKernel(dim)
-    local_expn_class = LinearPDEConformingVolumeTaylorLocalExpansion
-    mpole_expn_class = LinearPDEConformingVolumeTaylorMultipoleExpansion
-    m_expn = mpole_expn_class(knl, order=order)
-    l_expn = local_expn_class(knl, order=order)
-
-    m2l = E2EFromCSR(ctx, m_expn, l_expn, name="loopy_kernel")
-    m2l.get_translation_loopy_insns()
-    m2l.ctx = None
-    m2l.device = None
-    return m2l
-
-
-def _sumpy_kernel_make(expn, param):
-    assert param.name == "m2l"
-    loopy_knl = expn.get_optimized_kernel()
-    loopy_knl = lp.add_and_infer_dtypes(
-        loopy_knl,
-        dict(
-            tgt_ibox=np.int32,
-            centers=np.float64,
-            tgt_center=np.float64,
-            target_boxes=np.int32,
-            src_ibox=np.int32,
-            src_expansions=np.float64,
-            tgt_rscale=np.float64,
-            src_rscale=np.float64,
-            src_box_starts=np.int32,
-            src_box_lists=np.int32,
-        ),
-    )
-    return loopy_knl
-
-
-@dataclass(frozen=True)
-class Param:
-    name: str
-    dim: int
-    order: int
-
-
-def cached_data(params):
-    data = {}
-    np.random.seed(17)
-    logging.basicConfig(level=logging.INFO)
-    for param in params:
-        data[param] = {}
-        expn = _sumpy_kernel_init(param)
-        data[param]["setup"] = expn
-        knl = _sumpy_kernel_make(expn, param)
-        knl = lp.preprocess_kernel(knl)
-        data[param]["instantiated"] = knl
-        scheduled = knl.with_kernel(lp.get_one_scheduled_kernel(knl["loopy_kernel"],
-                                               knl.callables_table))
-        data[param]["scheduled"] = scheduled
-    return data
-
-
-class SumpyBenchmarkSuite:
-
-    params = [
-        Param("m2l", dim=3, order=6),
-        Param("m2l", dim=3, order=12),
-    ]
-
-    param_names = ["test_name"]
-
-    version = 1
-
-    def setup_cache(self):
-        return cached_data(self.params)
-
-    def time_instantiate(self, data, param):
-        knl = _sumpy_kernel_make(data[param]["setup"], param)
-        lp.preprocess_kernel(knl)
-
-    def time_schedule(self, data, param):
-        knl = data[param]["instantiated"]
-        knl.with_kernel(lp.get_one_scheduled_kernel(knl["loopy_kernel"],
-                                                    knl.callables_table))
-
-    def time_generate_code(self, data, param):
-        lp.generate_code_v2(data[param]["scheduled"])
-
-    time_instantiate.timeout = 600.0
-    time_schedule.timeout = 600.0
-    time_generate_code.timeout = 600.0
-
-    # Use CPU time as the timer
-    time_instantiate.timer = time.process_time
-    time_schedule.timer = time.process_time
-    time_generate_code.timer = time.process_time
-
-    # No warmup is needed
-    time_instantiate.warmup_time = 0
-    time_schedule.warmup_time = 0
-    time_generate_code.warmup_time = 0
-
-    # Run memory benchmarks as well
-    peakmem_instantiate = time_instantiate
-    peakmem_schedule = time_schedule
-    peakmem_generate_code = time_generate_code
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index c53c56530..1fa237b25 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -200,6 +200,8 @@ Tag                             Meaning
 ``"l.N"``                       Local (intra-group) axis N ("local")
 ``"g.N"``                       Group-number axis N ("group")
 ``"unr"``                       Unroll
+``"unr_hint"``                  Unroll using compiler directives
+``"unr_hint.N"``                Unroll at most N times using compiler directives
 ``"ilp"`` | ``"ilp.unr"``       Unroll using instruction-level parallelism
 ``"ilp.seq"``                   Realize parallel iname as innermost loop
 ``"like.INAME"``                Can be used when tagging inames to tag like another
@@ -535,8 +537,6 @@ have the lifetime of a kernel invocation.
 .. autoclass:: AddressSpace
 
 .. autoclass:: TemporaryVariable
-    :members:
-    :undoc-members:
 
 .. _types:
 
diff --git a/doc/ref_other.rst b/doc/ref_other.rst
index b13f39869..8ce3be0ca 100644
--- a/doc/ref_other.rst
+++ b/doc/ref_other.rst
@@ -16,10 +16,11 @@ Controlling caching
 Running Kernels
 ---------------
 
-In addition to simply calling kernels using :meth:`LoopKernel.__call__`,
-the following underlying functionality may be used:
+Use :meth:`TranslationUnit.executor` to bind a translation unit
+to execution resources, and then use :meth:`ExecutorBase.__call__`
+to invoke the kernel.
 
-.. autoclass:: CompiledKernel
+.. autoclass:: ExecutorBase
 
 Automatic Testing
 -----------------
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 8e65e4591..617aef6f8 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -29,6 +29,8 @@ import a few modules and set up a :class:`pyopencl.Context` and a
 
     >>> from warnings import filterwarnings, catch_warnings
     >>> filterwarnings('error', category=lp.LoopyWarning)
+    >>> from loopy.diagnostic import DirectCallUncachedWarning
+    >>> filterwarnings('ignore', category=DirectCallUncachedWarning)
 
     >>> ctx = cl.create_some_context(interactive=False)
     >>> queue = cl.CommandQueue(ctx)
@@ -1057,7 +1059,6 @@ earlier:
         acc_k = 0.0f;
       if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
         a_fetch[lid(0)] = a[16 * gid(0) + lid(0)];
-      barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */;
       if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
       {
         for (int k = 0; k <= 15; ++k)
@@ -1466,7 +1467,7 @@ We'll also request a prefetch--but suppose we only do so across the
 
 .. doctest::
 
-    >>> knl = lp.add_prefetch(knl, "a", "i_inner")
+    >>> knl = lp.add_prefetch(knl, "a", "i_inner", default_tag="l.auto")
 
 When we try to run our code, we get the following warning from loopy as a first
 sign that something is amiss:
diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py
index 3458a6e0e..486536cc0 100644
--- a/examples/python/hello-loopy.py
+++ b/examples/python/hello-loopy.py
@@ -24,7 +24,11 @@
 
 # execute
 # -------
+# easy, slower:
 evt, (out,) = knl(queue, a=a)
+# efficient, with caching:
+knl_ex = knl.executor(ctx)
+evt, (out,) = knl_ex(queue, a=a)
 # ENDEXAMPLE
 
 knl = lp.add_and_infer_dtypes(knl, {"a": np.dtype(np.float32)})
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 4796c1f59..e5aa4259a 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -28,6 +28,7 @@
 
 # {{{ imported user interface
 
+from loopy.typing import auto
 from loopy.kernel.instruction import (
         LegacyStringInstructionTag, UseStreamingStoreTag,
         MemoryOrdering,
@@ -37,7 +38,6 @@
         MultiAssignmentBase, Assignment,
         CallInstruction, CInstruction, NoOpInstruction, BarrierInstruction)
 from loopy.kernel.data import (
-        auto,
         KernelArgument,
         ValueArg, ArrayArg, GlobalArg, ConstantArg, ImageArg,
         AddressSpace,
@@ -147,7 +147,6 @@
 from loopy.codegen.result import (
         GeneratedProgram,
         CodeGenerationResult)
-from loopy.compiled import CompiledKernel
 from loopy.options import Options
 from loopy.auto_test import auto_test_vs_ref
 from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran,
@@ -162,7 +161,10 @@
 from loopy.target.pyopencl import PyOpenCLTarget
 from loopy.target.ispc import ISPCTarget
 
-from loopy.tools import Optional, t_unit_to_python, memoize_on_disk
+from loopy.tools import (Optional, t_unit_to_python, memoize_on_disk,
+                         clear_in_mem_caches)
+
+from loopy.target.execution import ExecutorBase
 
 
 __all__ = [
@@ -293,8 +295,6 @@
         "gather_access_footprints", "gather_access_footprint_bytes",
         "Sync",
 
-        "CompiledKernel",
-
         "auto_test_vs_ref",
 
         "Options",
@@ -311,7 +311,9 @@
         "PyOpenCLTarget", "ISPCTarget",
         "ASTBuilderBase",
 
-        "Optional", "memoize_on_disk",
+        "Optional", "memoize_on_disk", "clear_in_mem_caches",
+
+        "ExecutorBase",
 
         # {{{ from this file
 
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 01e84b4a5..32f89992a 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -34,7 +34,7 @@
 from pytools import UniqueNameGenerator
 
 from pytools.persistent_dict import WriteOncePersistentDict
-from loopy.tools import LoopyKeyBuilder
+from loopy.tools import LoopyKeyBuilder, caches
 from loopy.version import DATA_MODEL_VERSION
 from loopy.types import LoopyType
 from loopy.typing import ExpressionT
@@ -315,6 +315,9 @@ def ast_builder(self):
          key_builder=LoopyKeyBuilder())
 
 
+caches.append(code_gen_cache)
+
+
 class InKernelCallablesCollector(CombineMapper):
     """
     Returns an instance of :class:`frozenset` containing instances of
@@ -493,7 +496,7 @@ def diverge_callee_entrypoints(program):
 
         new_callables[name] = clbl
 
-    return program.copy(callables_table=new_callables)
+    return program.copy(callables_table=Map(new_callables))
 
 
 @dataclass(frozen=True)
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 349f82ebd..29a7d6d72 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -24,6 +24,7 @@
 """
 
 import islpy as isl
+from functools import partial
 
 from loopy.codegen.result import merge_codegen_results, wrap_in_if
 from loopy.schedule import (
@@ -72,7 +73,7 @@ def generate_code_for_sched_index(codegen_state, sched_index):
     elif isinstance(sched_item, EnterLoop):
         from loopy.kernel.data import (UnrolledIlpTag, UnrollTag,
                 ForceSequentialTag, LoopedIlpTag, VectorizeTag,
-                InameImplementationTag,
+                InameImplementationTag, UnrollHintTag,
                 InOrderSequentialSequentialTag, filter_iname_tags_by_type)
 
         tags = kernel.iname_tags_of_type(sched_item.iname, InameImplementationTag)
@@ -87,9 +88,14 @@ def generate_code_for_sched_index(codegen_state, sched_index):
             func = generate_unroll_loop
         elif filter_iname_tags_by_type(tags, VectorizeTag):
             func = generate_vectorize_loop
+        elif filter_iname_tags_by_type(tags, UnrollHintTag):
+            unroll_tags = filter_iname_tags_by_type(tags, UnrollHintTag)
+            hints = [codegen_state.ast_builder.emit_unroll_hint(tag.value)
+                    for tag in unroll_tags]
+            func = partial(generate_sequential_loop_dim_code, hints=hints)
         elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag,
                     ForceSequentialTag, InOrderSequentialSequentialTag)):
-            func = generate_sequential_loop_dim_code
+            func = partial(generate_sequential_loop_dim_code, hints=[])
         else:
             raise RuntimeError("encountered (invalid) EnterLoop "
                     "for '%s', tagged '%s'"
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 2dfb532f2..d76ffc121 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -287,14 +287,12 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
 
     result = []
 
-    bounds = kernel.get_iname_bounds(iname)
     domain = kernel.get_inames_domain(iname)
 
     # It's ok to find a bound that's too "loose". The conditional
     # generators will mop up after us.
-    from loopy.isl_helpers import static_min_of_pw_aff
-    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
-            constants_only=False)
+    from loopy.kernel.tools import get_hw_axis_base_for_codegen
+    lower_bound = get_hw_axis_base_for_codegen(kernel, iname)
 
     # These bounds are 'implemented' by the hardware. Make sure
     # that the downstream conditional generators realize that.
@@ -345,7 +343,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
 
 # {{{ sequential loop
 
-def generate_sequential_loop_dim_code(codegen_state, sched_index):
+def generate_sequential_loop_dim_code(codegen_state, sched_index, hints):
     kernel = codegen_state.kernel
 
     ecm = codegen_state.expression_to_code_mapper
@@ -479,7 +477,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index):
                         codegen_state, loop_iname, kernel.index_dtype,
                         pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)),
                         pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)),
-                        inner_ast)))
+                        inner_ast, hints)))
 
     return merge_codegen_results(codegen_state, result)
 
diff --git a/loopy/compiled.py b/loopy/compiled.py
deleted file mode 100644
index 0fa18eacb..000000000
--- a/loopy/compiled.py
+++ /dev/null
@@ -1,41 +0,0 @@
-__copyright__ = "Copyright (C) 2016 Andreas Kloeckner"
-
-__license__ = """
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-"""
-
-
-from loopy.target.pyopencl_execution import (  # noqa
-        PyOpenCLKernelExecutor)
-
-
-# {{{ compatibility
-
-class CompiledKernel(PyOpenCLKernelExecutor):
-    """
-    .. automethod:: __call__
-    """
-    def __init__(self, context, kernel, entrypoint):
-        from warnings import warn
-        warn("CompiledKernel is deprecated. Use LoopKernel.__call__ directly.",
-                DeprecationWarning, stacklevel=2)
-
-        super().__init__(context, kernel, entrypoint)
-
-# }}}
diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index c81d38c34..e7a93cd11 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -45,6 +45,10 @@ class ParameterFinderWarning(LoopyWarning):
 class WriteRaceConditionWarning(LoopyWarning):
     pass
 
+
+class DirectCallUncachedWarning(LoopyWarning):
+    pass
+
 # }}}
 
 
diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py
index b9a0aa27d..c23563c83 100644
--- a/loopy/frontend/fortran/translator.py
+++ b/loopy/frontend/fortran/translator.py
@@ -23,6 +23,7 @@
 import re
 
 from sys import intern
+from immutables import Map
 
 import loopy as lp
 import numpy as np
@@ -325,7 +326,7 @@ def specialize_fortran_division(t_unit):
 
         new_callables[name] = clbl
 
-    return t_unit.copy(callables_table=new_callables)
+    return t_unit.copy(callables_table=Map(new_callables))
 
 # }}}
 
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 72b842dbb..59716edb8 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -52,12 +52,6 @@ def pw_aff_to_aff(pw_aff):
     return pieces[0][1]
 
 
-def dump_space(ls):
-    return " ".join(
-            "%s: %d" % (dim_type.find_value(dt), ls.dim(dt))
-            for dt in range(1 + dim_type.all))
-
-
 # {{{ make_slab
 
 def make_slab(space, iname, start, stop, iname_multiplier=1):
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index fb318bc93..d3b6ec0ea 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -60,6 +60,7 @@
     from loopy.kernel.function_interface import InKernelCallable
     from loopy.codegen import PreambleInfo
 
+
 # {{{ loop kernel object
 
 class KernelState(IntEnum):  # noqa
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index dd182211b..165727e6b 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -654,9 +654,6 @@ class ArrayBase(ImmutableRecord, Taggable):
             * a pymbolic expression
             * :class:`loopy.auto`, in which case an offset argument
               is added automatically, immediately following this argument.
-              :class:`loopy.CompiledKernel` is even smarter in its treatment of
-              this case and will compile custom versions of the kernel based on
-              whether the passed arrays have offsets or not.
 
     .. attribute:: dim_names
 
@@ -844,6 +841,8 @@ def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0,
                     n_axes=num_user_axes,
                     use_increasing_target_axes=self.max_target_axes > 1,
                     dim_names=dim_names)
+
+        if dim_tags is not None:
             order = None
 
         # }}}
@@ -921,7 +920,7 @@ def __eq__(self, other):
                 is_tuple_of_expressions_equal as istoee,
                 is_expression_equal as isee)
         return (
-                type(self) == type(other)
+                type(self) is type(other)
                 and self.name == other.name
                 and self.dtype == other.dtype
                 and istoee(self.shape, other.shape)
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 8dcdbe3c1..4134b11ce 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -27,13 +27,14 @@
 
 from pymbolic.mapper import CSECachingMapperMixin
 from pymbolic.primitives import Slice, Variable, Subscript, Call
+from loopy.kernel.array import FixedStrideArrayDimTag
 from loopy.tools import intern_frozenset_of_ids, Optional
 from loopy.symbolic import (
         IdentityMapper, WalkMapper, SubArrayRef)
 from loopy.kernel.data import (
         InstructionBase,
         MultiAssignmentBase, Assignment,
-        SubstitutionRule, AddressSpace, ValueArg)
+        SubstitutionRule, AddressSpace, ValueArg, auto)
 from loopy.translation_unit import for_each_kernel
 from loopy.diagnostic import LoopyError, warn_with_kernel
 import islpy as isl
@@ -41,7 +42,6 @@
 from pytools import ProcessLogger
 
 from sys import intern
-import loopy.version
 
 import re
 
@@ -992,6 +992,9 @@ def intern_if_str(s):
         subst_match = SUBST_RE.match(insn)
         if subst_match is not None:
             subst = parse_subst_rule(subst_match.groupdict())
+            if subst.name in substitutions:
+                raise LoopyError("attempt to redefine substitution rule "
+                                 f"'{subst.name}'")
             substitutions[subst.name] = subst
             continue
 
@@ -1732,8 +1735,30 @@ def apply_default_order_to_args(kernel, default_order):
 
     processed_args = []
     for arg in kernel.args:
-        if isinstance(arg, ArrayBase) and arg.order is None:
-            arg = arg.copy(order=default_order)
+        if isinstance(arg, ArrayBase):
+            if default_order in ["c", "f", "C", "F"]:
+                if arg.dim_tags is None:
+                    arg = arg.copy(order=default_order)
+                else:
+                    # leave them the way they are
+                    pass
+            elif default_order is auto:
+                if arg.dim_tags is None and arg.shape is not None:
+                    assert arg.shape is not auto
+                    arg = arg.copy(
+                            dim_tags=tuple(
+                                FixedStrideArrayDimTag(auto)
+                                for i in range(len(arg.shape))))
+                    arg = arg.copy(
+                            dim_tags=tuple(
+                                FixedStrideArrayDimTag(auto)
+                                if isinstance(dim_tag, FixedStrideArrayDimTag)
+                                else dim_tag
+                                for dim_tag in arg.dim_tags))
+            else:
+                raise ValueError("unexpected value for default_order: "
+                                 f"'{default_order}'")
+
         processed_args.append(arg)
 
     return kernel.copy(args=processed_args)
@@ -2196,7 +2221,10 @@ def make_function(domains, instructions, kernel_data=None, **kwargs):
     :arg preamble_generators: a list of functions of signature
         (seen_dtypes, seen_functions) where seen_functions is a set of
         (name, c_name, arg_dtypes), generating extra entries for *preambles*.
-    :arg default_order: "C" (default) or "F"
+    :arg default_order: "C" (default), "F" or :class:`loopy.auto`.
+        The default memory layout of arrays that are not explicitly
+        specified. If :class:`loopy.auto`, variables for strides are
+        automatically created.
     :arg default_offset: 0 or :class:`loopy.auto`. The default value of
         *offset* in :attr:`ArrayArg` for guessed arguments.
         Defaults to 0.
@@ -2299,8 +2327,9 @@ def make_function(domains, instructions, kernel_data=None, **kwargs):
 
     from loopy.version import LANGUAGE_VERSION_SYMBOLS
 
+    import loopy.version as v
     version_to_symbol = {
-            getattr(loopy.version, lvs): lvs
+            getattr(v, lvs): lvs
             for lvs in LANGUAGE_VERSION_SYMBOLS}
 
     lang_version = kwargs.pop("lang_version", None)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 6f46214c7..e4267ab6d 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -39,8 +39,8 @@
 
 from loopy.kernel.array import ArrayBase, ArrayDimImplementationTag
 from loopy.diagnostic import LoopyError
-from loopy.typing import ExpressionT
-from loopy.types import LoopyType
+from loopy.typing import ExpressionT, ShapeType
+from loopy.types import LoopyType, auto
 from loopy.kernel.instruction import (  # noqa
         InstructionBase,
         MemoryOrdering,
@@ -117,13 +117,6 @@ def _names_from_dim_tags(
 # }}}
 
 
-class auto:  # noqa
-    """A generic placeholder object for something that should be automatically
-    determined.  See, for example, the *shape* or *strides* argument of
-    :class:`ArrayArg`.
-    """
-
-
 # {{{ iname tags
 
 def filter_iname_tags_by_type(tags, tag_type, max_num=None, min_num=None):
@@ -261,6 +254,24 @@ def __str__(self):
         return "unr"
 
 
+class UnrollHintTag(InameImplementationTag):
+    """Unroll via compiler directives; *value* optionally caps the count."""
+    __slots__ = ["value"]
+
+    def __init__(self, value=None):
+        ImmutableRecord.__init__(self, value=value)
+
+    @property
+    def key(self):
+        return (type(self).__name__, self.value)
+
+    def __str__(self):
+        # "is not None": a count of 0 must not print like the bare tag.
+        if self.value is not None:
+            return f"unr_hint.{self.value}"
+        return "unr_hint"
+
+
 class ForceSequentialTag(InameImplementationTag):
     def __str__(self):
         return "forceseq"
@@ -271,12 +282,14 @@ def __str__(self):
         return "ord"
 
 
-def parse_tag(tag):
-    from pytools.tag import Tag as TagBase
+ToInameTagConvertible = Union[str, None, Tag]
+
+
+def parse_tag(tag: ToInameTagConvertible) -> Optional[Tag]:
     if tag is None:
         return tag
 
-    if isinstance(tag, TagBase):
+    if isinstance(tag, Tag):
         return tag
 
     if not isinstance(tag, str):
@@ -294,6 +307,11 @@ def parse_tag(tag):
         return UnrolledIlpTag()
     elif tag == "ilp.seq":
         return LoopedIlpTag()
+    elif tag == "unr_hint":
+        return UnrollHintTag()
+    elif tag.startswith("unr_hint."):
+        offset = len("unr_hint.")
+        return UnrollHintTag(int(tag[offset:]))
     elif tag.startswith("g."):
         return GroupInameTag(int(tag[2:]))
     elif tag.startswith("l."):
@@ -647,8 +665,16 @@ class TemporaryVariable(ArrayBase):
         declaration.
     """
 
-    min_target_axes = 0
-    max_target_axes = 1
+    storage_shape: Optional[ShapeType]
+    base_indices: Optional[Tuple[ExpressionT, ...]]
+    address_space: Union[AddressSpace, Type[auto]]
+    base_storage: Optional[str]
+    initializer: Optional[np.ndarray]
+    read_only: bool
+    _base_storage_access_may_be_aliasing: bool
+
+    min_target_axes: ClassVar[int] = 0
+    max_target_axes: ClassVar[int] = 1
 
     allowed_extra_kwargs = (
             "storage_shape",
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 1a22cbbb9..ed18e2ba4 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -31,6 +31,7 @@
 
 from loopy.diagnostic import LoopyError
 from loopy.tools import Optional
+from collections.abc import Set as abc_Set
 
 
 # {{{ instruction tags
@@ -186,7 +187,7 @@ class InstructionBase(ImmutableRecord, Taggable):
         A :class:`frozenset` of subclasses of :class:`pytools.tag.Tag` used to
         provide metadata on this object. Legacy string tags are converted to
         :class:`LegacyStringInstructionTag` or, if they used to carry
-        a functional meaning, the tag carrying that same fucntional meaning
+        a functional meaning, the tag carrying that same functional meaning
         (e.g. :class:`UseStreamingStoreTag`).
 
     .. automethod:: __init__
@@ -267,7 +268,7 @@ def __init__(self, id, depends_on, depends_on_is_final,
         if depends_on_is_final is None:
             depends_on_is_final = False
 
-        if depends_on_is_final and not isinstance(depends_on, frozenset):
+        if depends_on_is_final and not isinstance(depends_on, abc_Set):
             raise LoopyError("Setting depends_on_is_final to True requires "
                     "actually specifying depends_on")
 
@@ -277,7 +278,7 @@ def __init__(self, id, depends_on, depends_on_is_final,
         if priority is None:
             priority = 0
 
-        if not isinstance(tags, frozenset):
+        if not isinstance(tags, abc_Set):
             # was previously allowed to be tuple
             tags = frozenset(tags)
 
@@ -292,10 +293,10 @@ def __init__(self, id, depends_on, depends_on_is_final,
         # assert all(is_interned(iname) for iname in within_inames)
         # assert all(is_interned(pred) for pred in predicates)
 
-        assert isinstance(within_inames, frozenset)
-        assert isinstance(depends_on, frozenset) or depends_on is None
-        assert isinstance(groups, frozenset)
-        assert isinstance(conflicts_with_groups, frozenset)
+        assert isinstance(within_inames, abc_Set)
+        assert isinstance(depends_on, abc_Set) or depends_on is None
+        assert isinstance(groups, abc_Set)
+        assert isinstance(conflicts_with_groups, abc_Set)
 
         ImmutableRecord.__init__(self,
                 id=id,
@@ -605,7 +606,7 @@ def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, self.var_name)
 
     def __eq__(self, other):
-        return (type(self) == type(other)
+        return (type(self) is type(other)
                 and self.var_name == other.var_name)
 
     def __ne__(self, other):
@@ -1062,7 +1063,8 @@ def __str__(self):
             result += " {%s}" % (": ".join(options))
 
         if self.predicates:
-            result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates)
+            result += "\n" + 10*" " + "if (%s)" % " && ".join(
+                    str(pred) for pred in self.predicates)
         return result
 
     def arg_id_to_arg(self):
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 99f5f3503..a86173fdc 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -2115,4 +2115,19 @@ def get_outer_params(domains):
 # }}}
 
 
+def get_hw_axis_base_for_codegen(kernel: LoopKernel, iname: str) -> isl.Aff:
+    """
+    Returns the static minimum of the lower bound of the hardware-concurrent
+    *iname*, as an :class:`isl.Aff`, to serve as the offsetting expression
+    applied to that hardware axis during code generation.
+    """
+    from loopy.kernel.data import HardwareConcurrentTag
+    from loopy.isl_helpers import static_min_of_pw_aff
+
+    assert kernel.iname_tags_of_type(iname, HardwareConcurrentTag)
+    bounds = kernel.get_iname_bounds(iname)
+    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
+                                       constants_only=False)
+    return lower_bound
+
 # vim: foldmethod=marker
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 190e6bd9a..a4952d776 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -86,6 +86,9 @@ def __call__(self, dtype, operand1, operand2):
     def __ne__(self, other):
         return not self.__eq__(other)
 
+    def __repr__(self) -> str:
+        return type(self).__name__
+
     @staticmethod
     def parse_result_type(target, op_type):
         try:
@@ -118,7 +121,7 @@ def __hash__(self):
         return hash((type(self),))
 
     def __eq__(self, other):
-        return type(self) == type(other)
+        return type(self) is type(other)
 
     def __str__(self):
         result = type(self).__name__.replace("ReductionOperation", "").lower()
@@ -359,7 +362,7 @@ def __hash__(self):
         return hash(type(self))
 
     def __eq__(self, other):
-        return type(self) == type(other) and (self.inner_reduction ==
+        return type(self) is type(other) and (self.inner_reduction ==
                 other.inner_reduction)
 
     def __call__(self, dtypes, operand1, operand2, callables_table, target):
@@ -461,7 +464,7 @@ def __hash__(self):
         return hash(type(self))
 
     def __eq__(self, other):
-        return type(self) == type(other)
+        return type(self) is type(other)
 
     @property
     def arg_count(self):
diff --git a/loopy/match.py b/loopy/match.py
index 624276dce..423c4ecec 100644
--- a/loopy/match.py
+++ b/loopy/match.py
@@ -24,8 +24,14 @@
 THE SOFTWARE.
 """
 
+from abc import abstractmethod, ABC
+from dataclasses import dataclass
+from typing import FrozenSet, List, Sequence, Tuple, Union, Protocol
 from sys import intern
 
+from loopy.kernel import LoopKernel
+from loopy.kernel.instruction import InstructionBase
+
 
 NoneType = type(None)
 
@@ -33,6 +39,10 @@
 import pytools.tag
 
 __doc__ = """
+.. autoclass:: Matchable
+.. autoclass:: StackMatchComponent
+.. autoclass:: StackMatch
+
 .. autofunction:: parse_match
 
 .. autofunction:: parse_stack_match
@@ -117,8 +127,18 @@ def re_from_glob(s):
 
 # {{{ match expression
 
-class MatchExpressionBase:
-    def __call__(self, kernel, matchable):
+class Matchable(Protocol):
+    """
+    .. attribute:: tags
+    """
+    @property
+    def tags(self) -> FrozenSet[pytools.tag.Tag]:
+        ...
+
+
+class MatchExpressionBase(ABC):
+    @abstractmethod
+    def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool:
         raise NotImplementedError
 
     def __ne__(self, other):
@@ -135,7 +155,7 @@ def __inv__(self):
 
 
 class All(MatchExpressionBase):
-    def __call__(self, kernel, matchable):
+    def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool:
         return True
 
     def __str__(self):
@@ -148,15 +168,15 @@ def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, "all_match_expr")
 
     def __eq__(self, other):
-        return (type(self) == type(other))
+        return type(self) is type(other)
 
     def __hash__(self):
         return hash(type(self))
 
 
+@dataclass(frozen=True, eq=True)
 class MultiChildMatchExpressionBase(MatchExpressionBase):
-    def __init__(self, children):
-        self.children = children
+    children: Sequence[MatchExpressionBase]
 
     def __str__(self):
         joiner = " %s " % type(self).__name__.lower()
@@ -167,33 +187,22 @@ def __repr__(self):
                 type(self).__name__,
                 ", ".join(repr(ch) for ch in self.children))
 
-    def update_persistent_hash(self, key_hash, key_builder):
-        key_builder.rec(key_hash, type(self).__name__)
-        key_builder.rec(key_hash, self.children)
-
-    def __eq__(self, other):
-        return (type(self) == type(other)
-                and self.children == other.children)
-
-    def __hash__(self):
-        return hash((type(self), self.children))
-
 
 class And(MultiChildMatchExpressionBase):
-    def __call__(self, kernel, matchable):
+    def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool:
         return all(ch(kernel, matchable) for ch in self.children)
 
 
 class Or(MultiChildMatchExpressionBase):
-    def __call__(self, kernel, matchable):
+    def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool:
         return any(ch(kernel, matchable) for ch in self.children)
 
 
+@dataclass(frozen=True, eq=True)
 class Not(MatchExpressionBase):
-    def __init__(self, child):
-        self.child = child
+    child: MatchExpressionBase
 
-    def __call__(self, kernel, matchable):
+    def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool:
         return not self.child(kernel, matchable)
 
     def __str__(self):
@@ -202,18 +211,8 @@ def __str__(self):
     def __repr__(self):
         return "{}({!r})".format(type(self).__name__, self.child)
 
-    def update_persistent_hash(self, key_hash, key_builder):
-        key_builder.rec(key_hash, "not_match_expr")
-        key_builder.rec(key_hash, self.child)
-
-    def __eq__(self, other):
-        return (type(self) == type(other)
-                and self.child == other.child)
-
-    def __hash__(self):
-        return hash((type(self), self.child))
-
 
+@dataclass(frozen=True, eq=True)
 class ObjTagged(MatchExpressionBase):
     """Match if the object is tagged with a given :class:`~pytools.tag.Tag`.
 
@@ -222,19 +221,14 @@ class ObjTagged(MatchExpressionBase):
         These instance-based tags will, in the not-too-distant future, replace
         the string-based tags matched by :class:`Tagged`.
     """
-    def __init__(self, tag: pytools.tag.Tag):
-        self.tag = tag
+    tag: pytools.tag.Tag
 
-    def __call__(self, kernel, matchable):
+    def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool:
         return self.tag in matchable.tags
 
-    def update_persistent_hash(self, key_hash, key_builder):
-        key_builder.rec(key_hash, type(self).__name__)
-        key_builder.rec(key_hash, self.tag)
-
 
 class GlobMatchExpressionBase(MatchExpressionBase):
-    def __init__(self, glob):
+    def __init__(self, glob: str) -> None:
         self.glob = glob
 
         import re
@@ -253,7 +247,7 @@ def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, self.glob)
 
     def __eq__(self, other):
-        return (type(self) == type(other)
+        return (type(self) is type(other)
                 and self.glob == other.glob)
 
     def __hash__(self):
@@ -273,7 +267,8 @@ class Tagged(GlobMatchExpressionBase):
         These string-based tags will, in the not-too-distant future, be replace
         by instance-based tags matched by :class:`ObjTagged`.
     """
-    def __call__(self, kernel, matchable):
+
+    def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool:
         from loopy.kernel.instruction import LegacyStringInstructionTag
         if matchable.tags:
             return any(
@@ -289,13 +284,17 @@ def __call__(self, kernel, matchable):
 
 
 class Writes(GlobMatchExpressionBase):
-    def __call__(self, kernel, matchable):
+    def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool:
+        if not isinstance(matchable, InstructionBase):
+            return False
         return any(self.re.match(name)
                 for name in matchable.assignee_var_names())
 
 
 class Reads(GlobMatchExpressionBase):
-    def __call__(self, kernel, matchable):
+    def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool:
+        if not isinstance(matchable, InstructionBase):
+            return False
         return any(self.re.match(name)
                 for name in matchable.read_dependency_names())
 
@@ -306,7 +305,10 @@ def __call__(self, kernel, matchable):
 
 
 class Iname(GlobMatchExpressionBase):
-    def __call__(self, kernel, matchable):
+    def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool:
+        if not isinstance(matchable, InstructionBase):
+            return False
+
         return any(self.re.match(name)
                 for name in matchable.within_inames)
 
@@ -421,39 +423,47 @@ def inner_parse(pstate, min_precedence=0):
 
 # {{{ stack match objects
 
-class StackMatchComponent:
+class StackMatchComponent(ABC):
+    """
+    .. automethod:: __call__
+    """
+
+    @abstractmethod
+    def __call__(self, kernel: LoopKernel, stack: Sequence[Matchable]) -> bool:
+        pass
+
     def __ne__(self, other):
         return not self.__eq__(other)
 
 
 class StackAllMatchComponent(StackMatchComponent):
-    def __call__(self, kernel, stack):
+    def __call__(self, kernel: LoopKernel, stack: Sequence[Matchable]) -> bool:
         return True
 
     def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, "all_match")
 
     def __eq__(self, other):
-        return (type(self) == type(other))
+        return type(self) is type(other)
 
 
 class StackBottomMatchComponent(StackMatchComponent):
-    def __call__(self, kernel, stack):
+    def __call__(self, kernel: LoopKernel, stack: Sequence[Matchable]) -> bool:
         return not stack
 
     def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, "bottom_match")
 
     def __eq__(self, other):
-        return (type(self) == type(other))
+        return type(self) is type(other)
 
 
+@dataclass(eq=True, frozen=True)
 class StackItemMatchComponent(StackMatchComponent):
-    def __init__(self, match_expr, inner_match):
-        self.match_expr = match_expr
-        self.inner_match = inner_match
+    match_expr: MatchExpressionBase
+    inner_match: StackMatchComponent
 
-    def __call__(self, kernel, stack):
+    def __call__(self, kernel: LoopKernel, stack: Sequence[Matchable]) -> bool:
         if not stack:
             return False
 
@@ -463,22 +473,12 @@ def __call__(self, kernel, stack):
 
         return self.inner_match(kernel, stack[1:])
 
-    def update_persistent_hash(self, key_hash, key_builder):
-        key_builder.rec(key_hash, "item_match")
-        key_builder.rec(key_hash, self.match_expr)
-        key_builder.rec(key_hash, self.inner_match)
-
-    def __eq__(self, other):
-        return (type(self) == type(other)
-                and self.match_expr == other.match_expr
-                and self.inner_match == other.inner_match)
-
 
+@dataclass(eq=True, frozen=True)
 class StackWildcardMatchComponent(StackMatchComponent):
-    def __init__(self, inner_match):
-        self.inner_match = inner_match
+    inner_match: StackMatchComponent
 
-    def __call__(self, kernel, stack):
+    def __call__(self, kernel: LoopKernel, stack: Sequence[Matchable]) -> bool:
         for i in range(0, len(stack)):
             if self.inner_match(kernel, stack[i:]):
                 return True
@@ -490,10 +490,10 @@ def __call__(self, kernel, stack):
 
 # {{{ stack matcher
 
+@dataclass(eq=True, frozen=True)
 class RuleInvocationMatchable:
-    def __init__(self, id, tags):
-        self.id = id
-        self.tags = tags
+    id: str
+    tags: FrozenSet[pytools.tag.Tag]
 
     def write_dependency_names(self):
         raise TypeError("writes: query may not be applied to rule invocations")
@@ -505,27 +505,21 @@ def inames(self, kernel):
         raise TypeError("inames: query may not be applied to rule invocations")
 
 
+@dataclass(eq=True, frozen=True)
 class StackMatch:
-    def __init__(self, root_component):
-        self.root_component = root_component
-
-    def update_persistent_hash(self, key_hash, key_builder):
-        key_builder.rec(key_hash, self.root_component)
-
-    def __eq__(self, other):
-        return (
-                type(self) == type(other)
-                and
-                self.root_component == other.root_component)
+    """
+    .. automethod:: __call__
+    """
 
-    def __ne__(self, other):
-        return not self.__eq__(other)
+    root_component: StackMatchComponent
 
-    def __call__(self, kernel, insn, rule_stack):
+    def __call__(
+            self, kernel: LoopKernel, insn: InstructionBase,
+            rule_stack: Sequence[Tuple[str, FrozenSet[pytools.tag.Tag]]]) -> bool:
         """
         :arg rule_stack: a tuple of (name, tags) rule invocation, outermost first
         """
-        stack_of_matchables = [insn]
+        stack_of_matchables: List[Matchable] = [insn]
         for id, tags in rule_stack:
             stack_of_matchables.append(RuleInvocationMatchable(id, tags))
 
@@ -536,7 +530,10 @@ def __call__(self, kernel, insn, rule_stack):
 
 # {{{ stack match parsing
 
-def parse_stack_match(smatch):
+ToStackMatchCovertible = Union[StackMatch, str, None]
+
+
+def parse_stack_match(smatch: ToStackMatchCovertible) -> StackMatch:
     """Syntax example::
 
         ... > outer > ... > next > innermost $
@@ -561,7 +558,7 @@ def parse_stack_match(smatch):
 
     smatch = smatch.strip()
 
-    match = StackAllMatchComponent()
+    match: StackMatchComponent = StackAllMatchComponent()
     if smatch[-1] == "$":
         match = StackBottomMatchComponent()
         smatch = smatch[:-1]
diff --git a/loopy/options.py b/loopy/options.py
index 4763252bc..64667463a 100644
--- a/loopy/options.py
+++ b/loopy/options.py
@@ -131,7 +131,7 @@ class Options(ImmutableRecord):
         output values. This is helpful if arguments are inferred
         and argument ordering is thus implementation-defined.
 
-        See :meth:`CompiledKernel.__call__`.
+        See :meth:`ExecutorBase.__call__`.
 
     .. attribute:: write_wrapper
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 6c9456661..a84ac4359 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -46,7 +46,7 @@
 from loopy.transform.data import allocate_temporaries_for_base_storage
 from loopy.kernel.array import ArrayDimImplementationTag
 from loopy.kernel.data import _ArraySeparationInfo, KernelArgument
-from loopy.translation_unit import for_each_kernel
+from loopy.translation_unit import TranslationUnit, for_each_kernel
 from loopy.typing import ExpressionT
 
 from pytools import ProcessLogger
@@ -724,7 +724,7 @@ def filter_reachable_callables(t_unit):
                                                                  t_unit.entrypoints)
     new_callables = {name: clbl for name, clbl in t_unit.callables_table.items()
                      if name in (reachable_function_ids | t_unit.entrypoints)}
-    return t_unit.copy(callables_table=new_callables)
+    return t_unit.copy(callables_table=Map(new_callables))
 
 
 def _preprocess_single_kernel(kernel: LoopKernel, is_entrypoint: bool) -> LoopKernel:
@@ -788,33 +788,33 @@ def _preprocess_single_kernel(kernel: LoopKernel, is_entrypoint: bool) -> LoopKe
 
 
 @memoize_on_disk
-def preprocess_program(program):
+def preprocess_program(t_unit: TranslationUnit) -> TranslationUnit:
 
     from loopy.kernel import KernelState
-    if program.state >= KernelState.PREPROCESSED:
-        return program
+    if t_unit.state >= KernelState.PREPROCESSED:
+        return t_unit
 
-    if len([clbl for clbl in program.callables_table.values() if
+    if len([clbl for clbl in t_unit.callables_table.values() if
             isinstance(clbl, CallableKernel)]) == 1:
-        program = program.with_entrypoints(",".join(clbl.name for clbl in
-            program.callables_table.values() if isinstance(clbl,
+        t_unit = t_unit.with_entrypoints(",".join(clbl.name for clbl in
+            t_unit.callables_table.values() if isinstance(clbl,
                 CallableKernel)))
 
-    if not program.entrypoints:
+    if not t_unit.entrypoints:
         raise LoopyError("Translation unit did not receive any entrypoints")
 
     from loopy.translation_unit import resolve_callables
-    program = resolve_callables(program)
+    t_unit = resolve_callables(t_unit)
 
-    program = filter_reachable_callables(program)
+    t_unit = filter_reachable_callables(t_unit)
 
-    program = infer_unknown_types(program, expect_completion=False)
+    t_unit = infer_unknown_types(t_unit, expect_completion=False)
 
     from loopy.transform.subst import expand_subst
-    program = expand_subst(program)
+    t_unit = expand_subst(t_unit)
 
     from loopy.kernel.creation import apply_single_writer_depencency_heuristic
-    program = apply_single_writer_depencency_heuristic(program)
+    t_unit = apply_single_writer_depencency_heuristic(t_unit)
 
     # Ordering restrictions:
     #
@@ -826,7 +826,7 @@ def preprocess_program(program):
     #   defaults from being applied.
 
     from loopy.transform.realize_reduction import realize_reduction
-    program = realize_reduction(program, unknown_types_ok=False)
+    t_unit = realize_reduction(t_unit, unknown_types_ok=False)
 
     # {{{ preprocess callable kernels
 
@@ -838,11 +838,11 @@ def preprocess_program(program):
     # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects
 
     new_callables = {}
-    for func_id, in_knl_callable in program.callables_table.items():
+    for func_id, in_knl_callable in t_unit.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = _preprocess_single_kernel(
                     in_knl_callable.subkernel,
-                    is_entrypoint=func_id in program.entrypoints)
+                    is_entrypoint=func_id in t_unit.entrypoints)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
         elif isinstance(in_knl_callable, ScalarCallable):
@@ -853,16 +853,16 @@ def preprocess_program(program):
 
         new_callables[func_id] = in_knl_callable
 
-    program = program.copy(callables_table=new_callables)
+    t_unit = t_unit.copy(callables_table=Map(new_callables))
 
     # }}}
 
     # infer arg descrs of the callables
-    program = infer_arg_descr(program)
+    t_unit = infer_arg_descr(t_unit)
 
     # Ordering restriction:
     # callees with gbarrier in them must be inlined after inferrring arg_descr.
-    program = inline_kernels_with_gbarriers(program)
+    t_unit = inline_kernels_with_gbarriers(t_unit)
 
     # {{{ prepare for caching
 
@@ -873,7 +873,7 @@ def preprocess_program(program):
 
     # }}}
 
-    return program
+    return t_unit
 
 
 # FIXME: Do we add a deprecation warning?
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 4aaa08080..f80aa6f37 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -28,6 +28,7 @@
 from typing import (FrozenSet, Hashable, Sequence, AbstractSet, Any, Set, TypeVar,
                     Mapping, Dict, Tuple, Iterator, Optional, TYPE_CHECKING)
 
+from immutables import Map
 from pytools import ImmutableRecord
 import islpy as isl
 from loopy.diagnostic import LoopyError, ScheduleDebugInputError, warn_with_kernel
@@ -36,7 +37,7 @@
 
 from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.kernel.instruction import InstructionBase
-from loopy.tools import LoopyKeyBuilder
+from loopy.tools import LoopyKeyBuilder, caches
 from loopy.version import DATA_MODEL_VERSION
 
 if TYPE_CHECKING:
@@ -2202,6 +2203,9 @@ def print_longest_dead_end():
         key_builder=LoopyKeyBuilder())
 
 
+caches.append(schedule_cache)
+
+
 def _get_one_linearized_kernel_inner(kernel, callables_table):
     # This helper function exists to ensure that the generator chain is fully
     # out of scope after the function returns. This allows it to be
@@ -2275,7 +2279,7 @@ def linearize(t_unit):
         else:
             raise NotImplementedError(type(clbl))
 
-    return t_unit.copy(callables_table=new_callables)
+    return t_unit.copy(callables_table=Map(new_callables))
 
 
 # vim: foldmethod=marker
diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py
index f2164b6dd..fd7d46876 100644
--- a/loopy/schedule/tools.py
+++ b/loopy/schedule/tools.py
@@ -335,20 +335,28 @@ def get_return_from_kernel_mapping(kernel):
 
 # {{{ check for write races in accesses
 
-def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table):
+def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table,
+                            address_space):
     """
     Returns *True* if the execution instances of *insn_a* and *insn_b*, accessing
     the same variable via access maps *map_a* and *map_b*, result in an access race.
 
+    :arg address_space: An instance of :class:`loopy.kernel.data.AddressSpace`
+        of the variable whose accesses are being checked for a race.
+
     .. note::
 
         The accesses ``map_a``, ``map_b`` lead to write races iff there exists 2
         *unequal* global ids that access the same address.
     """
     import pymbolic.primitives as p
-    from loopy.symbolic import isl_set_from_expr
+    from loopy.symbolic import isl_set_from_expr, aff_from_expr, aff_to_expr
     from loopy.kernel.data import (filter_iname_tags_by_type,
-                                   HardwareConcurrentTag)
+                                   HardwareConcurrentTag,
+                                   AddressSpace)
+    from loopy.kernel.tools import get_hw_axis_base_for_codegen
+
+    assert address_space in [AddressSpace.LOCAL, AddressSpace.GLOBAL]
 
     gsize, lsize = knl.get_grid_size_upper_bounds(callables_table,
                                                   return_dict=True)
@@ -357,9 +365,10 @@ def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table):
 
     # Step 1.1: Project out inames which are also map's dims, but does not form the
     #           insn's within_inames
-    # Step 1.2: Project out sequential inames in the access maps
-    # Step 1.3: Rename the dims with their iname tags i.e. (g.i or l.i)
-    # Step 1.4: Name the ith output dims as _lp_dim{i}
+    # Step 1.2: Perform any offsetting required to the hw axes iname terms
+    # Step 1.3: Project out sequential inames in the access maps
+    # Step 1.4: Rename the dims with their iname tags i.e. (g.i or l.i)
+    # Step 1.5: Name the ith output dims as _lp_dim{i}
 
     updated_maps = []
 
@@ -381,6 +390,36 @@ def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table):
             if dt == isl.dim_type.in_:
                 tag, = filter_iname_tags_by_type(knl.inames[name].tags,
                                                  HardwareConcurrentTag)
+
+                iname_lower_bound = get_hw_axis_base_for_codegen(knl, name)
+
+                if not iname_lower_bound.plain_is_zero():
+                    # Hardware inames with nonzero base have an offset applied in
+                    # code generation:
+                    # https://github.com/inducer/loopy/blob/4e0b1c7635afe1473c8636377f8e7ef6d78dfd46/loopy/codegen/loop.py#L293-L297
+                    # https://github.com/inducer/loopy/issues/600#issuecomment-1104066735
+
+                    map_ = map_.add_dims(isl.dim_type.out, 1)
+                    map_ = map_.move_dims(
+                        isl.dim_type.in_, pos+1,
+                        isl.dim_type.out, map_.dim(isl.dim_type.out)-1,
+                        1
+                    )
+                    map_ = map_.set_dim_name(isl.dim_type.in_, pos+1, name+"'")
+
+                    lbound_offset_expr_aff = aff_from_expr(
+                        map_.domain().space,
+                        (p.Variable(name+"'")
+                         + aff_to_expr(iname_lower_bound)
+                         - p.Variable(name))
+                    )
+                    lbound_offset_as_domain = lbound_offset_expr_aff.zero_basic_set()
+                    map_ = map_.intersect_domain(lbound_offset_as_domain)
+
+                    map_ = map_.project_out(dt, pos, 1)
+                    assert map_.get_dim_name(dt, pos) == name+"'"
+                    map_ = map_.set_dim_name(dt, pos, name)
+
                 map_ = map_.set_dim_name(dt, pos, str(tag))
 
         for i_l in lsize:
@@ -438,25 +477,40 @@ def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table):
     # {{{ Step 5: create the set any(l.i.A != l.i.B) OR any(g.i.A != g.i.B)
 
     space = set_a.space
-    unequal_global_id_set = isl.Set.empty(set_a.get_space())
+    unequal_local_id_set = isl.Set.empty(set_a.get_space())
+    unequal_group_id_set = isl.Set.empty(set_a.get_space())
+    equal_group_id_set = isl.BasicSet.universe(set_a.get_space())
 
     for i_l in lsize:
         lid_a = p.Variable(f"l.{i_l}.A")
         lid_b = p.Variable(f"l.{i_l}.B")
-        unequal_global_id_set |= (isl_set_from_expr(space,
-                                                    p.Comparison(lid_a, "!=", lid_b))
-                                  )
+        unequal_local_id_set |= (isl_set_from_expr(space,
+                                                   p.Comparison(lid_a, "!=", lid_b))
+                                 )
 
     for i_g in gsize:
         gid_a = p.Variable(f"g.{i_g}.A")
         gid_b = p.Variable(f"g.{i_g}.B")
-        unequal_global_id_set |= (isl_set_from_expr(space,
-                                                    p.Comparison(gid_a, "!=", gid_b))
-                                  )
+        unequal_group_id_set |= (isl_set_from_expr(space,
+                                                   p.Comparison(gid_a, "!=", gid_b))
+                                 )
+        equal_group_id_set &= (isl_set_from_expr(space,
+                                                 p.Comparison(gid_a, "==", gid_b))
+                               )
 
     # }}}
 
-    return not (set_a & set_b & unequal_global_id_set).is_empty()
+    if address_space == AddressSpace.GLOBAL:
+        return not (set_a
+                    & set_b
+                    & (unequal_local_id_set
+                       | unequal_group_id_set)
+                    ).is_empty()
+    else:
+        return not (set_a
+                    & set_b
+                    & unequal_local_id_set
+                    & equal_group_id_set).is_empty()
 
 
 class AccessMapDescriptor(enum.Enum):
@@ -550,7 +604,10 @@ def do_accesses_result_in_races(self, insn1, insn1_dir, insn2, insn2_dir,
 
         return _check_for_access_races(insn1_amap, self.kernel.id_to_insn[insn1],
                                        insn2_amap, self.kernel.id_to_insn[insn2],
-                                       self.kernel, self.callables_table)
+                                       self.kernel, self.callables_table,
+                                       (self.kernel
+                                        .get_var_descriptor(var_name)
+                                        .address_space))
 
 # }}}
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index fd6013416..99cc56571 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -24,7 +24,7 @@
 """
 
 
-from typing import ClassVar, Tuple
+from typing import AbstractSet, ClassVar, Mapping, Sequence, Tuple
 from functools import reduce, cached_property
 from sys import intern
 import re
@@ -69,6 +69,7 @@
 from loopy.diagnostic import LoopyError
 from loopy.diagnostic import (ExpressionToAffineConversionError,
                               UnableToDetermineAccessRangeError)
+from loopy.typing import ExpressionT
 
 
 __doc__ = """
@@ -604,7 +605,7 @@ class TypedCSE(LoopyExpressionBase, p.CommonSubexpression):
     """
 
     def __init__(self, child, prefix=None, dtype=None):
-        super().__init__(child, prefix)
+        super().__init__(child, prefix=prefix, scope=p.cse_scope.EVALUATION)
         self.dtype = dtype
 
     def __getinitargs__(self):
@@ -1042,11 +1043,11 @@ def _get_dependencies_and_reduction_inames(expr):
     return deps, reduction_inames
 
 
-def get_dependencies(expr):
+def get_dependencies(expr: ExpressionT) -> AbstractSet[str]:
     return _get_dependencies_and_reduction_inames(expr)[0]
 
 
-def get_reduction_inames(expr):
+def get_reduction_inames(expr: ExpressionT) -> AbstractSet[str]:
     return _get_dependencies_and_reduction_inames(expr)[1]
 
 
@@ -1329,7 +1330,12 @@ def map_call(self, expr, expn_state, *args, **kwargs):
                                          *args, **kwargs)
 
     @staticmethod
-    def make_new_arg_context(rule_name, arg_names, arguments, arg_context):
+    def make_new_arg_context(
+            rule_name: str,
+            arg_names: Sequence[str],
+            arguments: Sequence[ExpressionT],
+            arg_context: Mapping[str, ExpressionT]
+            ) -> Mapping[str, ExpressionT]:
         if len(arg_names) != len(arguments):
             raise RuntimeError("Rule '%s' invoked with %d arguments (needs %d)"
                     % (rule_name, len(arguments), len(arg_names), ))
@@ -1577,7 +1583,8 @@ def map_call(self, expr):
                     tag = None
 
                 return p.CommonSubexpression(
-                        self.rec(expr.parameters[0]), tag)
+                        self.rec(expr.parameters[0]), tag,
+                        scope=p.cse_scope.EVALUATION)
             else:
                 raise TypeError("cse takes two arguments")
 
@@ -1601,6 +1608,16 @@ def map_call(self, expr):
             else:
                 raise TypeError("if takes three arguments")
 
+        elif name in ["minimum", "maximum"]:
+            if len(expr.parameters) == 2:
+                from pymbolic.primitives import Min, Max
+                return {
+                    "minimum": Min,
+                    "maximum": Max
+                }[name](tuple(self.rec(p) for p in expr.parameters))
+            else:
+                raise TypeError(f"{name} takes two arguments")
+
         else:
             # see if 'name' is an existing reduction op
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 54a0729da..be04d1008 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -23,7 +23,6 @@
 
 from __future__ import annotations
 
-
 __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
 
 __license__ = """
@@ -54,30 +53,42 @@
     from loopy.typing import ExpressionT
     from loopy.codegen import CodeGenerationState
     from loopy.codegen.result import CodeGenerationResult
+    from loopy.target.execution import ExecutorBase
+    from loopy.translation_unit import TranslationUnit, FunctionIdT
 
 
 ASTType = TypeVar("ASTType")
 
 
-class TargetBase():
+class TargetBase:
     """Base class for all targets, i.e. different combinations of code that
     loopy can generate.
 
     Objects of this type must be picklable.
     """
 
-    # {{{ persistent hashing
+    # {{{ hashing/equality
 
     hash_fields: ClassVar[Tuple[str, ...]] = ()
     comparison_fields: ClassVar[Tuple[str, ...]] = ()
 
+    def __hash__(self):
+        # NOTE: _hash_value may vanish during pickling; recompute it lazily.
+        if getattr(self, "_hash_value", None) is None:
+            from loopy.tools import LoopyKeyBuilder
+            key_hash = LoopyKeyBuilder.new_hash()
+            self.update_persistent_hash(key_hash, LoopyKeyBuilder())
+            object.__setattr__(self, "_hash_value", hash(key_hash.digest()))
+
+        return self._hash_value  # pylint: disable=no-member
+
     def update_persistent_hash(self, key_hash, key_builder):
         key_hash.update(type(self).__name__.encode())
         for field_name in self.hash_fields:
             key_builder.rec(key_hash, getattr(self, field_name))
 
     def __eq__(self, other):
-        if type(self) != type(other):
+        if type(self) is not type(other):
             return False
 
         for field_name in self.comparison_fields:
@@ -152,7 +163,9 @@ def get_kernel_executor_cache_key(self, *args, **kwargs):
         """
         raise NotImplementedError()
 
-    def get_kernel_executor(self, kernel, *args, **kwargs):
+    def get_kernel_executor(
+            self, t_unit: TranslationUnit, *args, entrypoint: FunctionIdT,
+            **kwargs) -> ExecutorBase:
         """
         :returns: an immutable type to be used as the cache key for
             kernel executor caching.
@@ -164,7 +177,7 @@ class ASTBuilderBase(Generic[ASTType]):
     """An interface for generating (host or device) ASTs.
     """
 
-    def __init__(self, target):
+    def __init__(self, target) -> None:
         self.target = target
 
     # {{{ library
@@ -249,7 +262,10 @@ def emit_multiple_assignment(self, codegen_state, insn):
         raise NotImplementedError()
 
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
-            static_lbound, static_ubound, inner):
+            static_lbound, static_ubound, inner, hints):
+        raise NotImplementedError()
+
+    def emit_unroll_hint(self, value):
         raise NotImplementedError()
 
     @property
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index acccbbf38..06dc2f099 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -23,12 +23,13 @@
 THE SOFTWARE.
 """
 
-from typing import cast, Tuple, Optional, Sequence
+from typing import cast, Tuple, Optional, Sequence, Any
 import re
 
 import numpy as np  # noqa
 
-from cgen import Pointer, NestedDeclarator, Block, Generable, Declarator, Const
+from cgen import (Collection, Pointer, NestedDeclarator, Block, Generable,
+                  Declarator, Const)
 from cgen.mapper import IdentityMapper as CASTIdentityMapperBase
 from pymbolic.mapper.stringifier import PREC_NONE
 import pymbolic.primitives as p
@@ -37,6 +38,8 @@
 from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder
 from loopy.diagnostic import LoopyError, LoopyTypeError
 from loopy.symbolic import IdentityMapper
+from loopy.target.execution import ExecutorBase
+from loopy.translation_unit import FunctionIdT, TranslationUnit
 from loopy.types import NumpyType, LoopyType
 from loopy.typing import ExpressionT
 from loopy.kernel import LoopKernel
@@ -244,12 +247,9 @@ def _preamble_generator(preamble_info, func_qualifier="inline"):
               {res_ctype} y = 1;
 
               while (n > 1) {{
-                if (n % 2) {{
+                if (n % 2)
                   y = x * y;
-                  x = x * x;
-                }}
-                else
-                  x = x * x;
+                x = x * x;
                 n = n / 2;
               }}
 
@@ -438,12 +438,6 @@ def dtype_to_typename(self, dtype):
         # These kind of shouldn't be here.
         return self.get_dtype_registry().dtype_to_ctype(dtype)
 
-    def get_kernel_executor_cache_key(self, *args, **kwargs):
-        raise NotImplementedError
-
-    def get_kernel_executor(self, knl, *args, **kwargs):
-        raise NotImplementedError
-
     # }}}
 
 
@@ -784,9 +778,6 @@ def get_function_definition(
 
         from cgen import (
                 FunctionBody,
-
-                # Post-mid-2016 cgens have 'Collection', too.
-                Module as Collection,
                 Initializer,
                 Line)
 
@@ -1092,6 +1083,7 @@ def get_temporary_var_declarator(self,
         if temp_var.storage_shape:
             shape = temp_var.storage_shape
         else:
+            assert isinstance(temp_var.shape, tuple)
             shape = temp_var.shape
 
         assert isinstance(shape, tuple)
@@ -1111,6 +1103,7 @@ def get_temporary_var_declarator(self,
             from cgen import AlignedAttribute
             temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl)
 
+        assert isinstance(temp_var.address_space, AddressSpace)
         return self.wrap_decl_for_address_space(temp_var_decl,
                 temp_var.address_space)
 
@@ -1224,7 +1217,7 @@ def emit_multiple_assignment(self, codegen_state, insn):
                                 in_knl_callable_as_call))
 
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
-            lbound, ubound, inner):
+            lbound, ubound, inner, hints):
         ecm = codegen_state.expression_to_code_mapper
 
         from pymbolic import var
@@ -1232,7 +1225,7 @@ def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
         from pymbolic.mapper.stringifier import PREC_NONE
         from cgen import For, InlineInitializer
 
-        return For(
+        loop = For(
                 InlineInitializer(
                     POD(self, iname_dtype, iname),
                     ecm(lbound, PREC_NONE, "i")),
@@ -1245,6 +1238,18 @@ def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
                 "++%s" % iname,
                 inner)
 
+        if hints:
+            return Collection(list(hints) + [loop])
+        else:
+            return loop
+
+    def emit_unroll_hint(self, value):
+        from cgen import Pragma
+        if value:
+            return Pragma(f"unroll {value}")
+        else:
+            return Pragma("unroll")
+
     def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
         decl = POD(self, dtype, name)
 
@@ -1378,10 +1383,11 @@ def get_kernel_executor_cache_key(self, *args, **kwargs):
         # and None isn't allowed in that setting.
         return _CExecutorCacheKey
 
-    def get_kernel_executor(self, t_unit, *args, **kwargs):
-        from loopy.target.c.c_execution import CKernelExecutor
-        return CKernelExecutor(t_unit, entrypoint=kwargs.pop("entrypoint"),
-                compiler=self.compiler)
+    def get_kernel_executor(
+            self, t_unit: TranslationUnit,
+            *args: Any, entrypoint: FunctionIdT, **kwargs: Any) -> ExecutorBase:
+        from loopy.target.c.c_execution import CExecutor
+        return CExecutor(t_unit, entrypoint=entrypoint, compiler=self.compiler)
 
     def get_host_ast_builder(self):
         # enable host code generation
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index 557c67f03..b1685cad1 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -43,7 +43,7 @@
 from loopy.schedule.tools import KernelArgInfo
 from loopy.codegen.result import GeneratedProgram
 from loopy.translation_unit import TranslationUnit
-from loopy.target.execution import (KernelExecutorBase,
+from loopy.target.execution import (ExecutorBase,
                              ExecutionWrapperGeneratorBase, get_highlighted_code)
 
 import logging
@@ -56,6 +56,17 @@ def _lpy_even_div(a, b):
         # FIXME: This error message is kind of crummy.
         raise ValueError("expected even division")
     return result
+
+
+def _lpy_even_div_none(a, b):
+    if a is None:
+        return None
+
+    result, remdr = divmod(a, b)
+    if remdr != 0:
+        # FIXME: This error message is kind of crummy.
+        raise ValueError("expected even division")
+    return result
 """
 
 
@@ -269,16 +280,17 @@ def __init__(self, toolchain=None,
                 # default args
                 self.toolchain = GCCToolchain(
                     cc="gcc",
+                    ld="ld",
                     cflags="-std=c99 -O3 -fPIC".split(),
                     ldflags="-shared".split(),
                     libraries=[],
                     library_dirs=[],
                     defines=[],
                     undefines=[],
-                    source_suffix="c",
                     so_ext=".so",
                     o_ext=".o",
-                    include_dirs=[])
+                    include_dirs=[],
+                    features=set())
 
         if toolchain is None:
             # copy in all differing values
@@ -443,9 +455,9 @@ class _KernelInfo:
     invoker: Callable[..., Any]
 
 
-# {{{ CKernelExecutor
+# {{{ CExecutor
 
-class CKernelExecutor(KernelExecutorBase):
+class CExecutor(ExecutorBase):
     """An object connecting a kernel to a :class:`CompiledKernel`
     for execution.
 
@@ -472,14 +484,9 @@ def get_wrapper_generator(self):
         return CExecutionWrapperGenerator()
 
     @memoize_method
-    def translation_unit_info(
-            self, entrypoint: str,
+    def translation_unit_info(self,
             arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> _KernelInfo:
-        # FIXME: Remove entrypoint argument
-        assert entrypoint == self.entrypoint
-
-        t_unit = self.get_typed_and_scheduled_translation_unit(
-                entrypoint, arg_to_dtype)
+        t_unit = self.get_typed_and_scheduled_translation_unit(arg_to_dtype)
 
         from loopy.codegen import generate_code_v2
         codegen_result = generate_code_v2(t_unit)
@@ -488,18 +495,18 @@ def translation_unit_info(
         host_code = codegen_result.host_code()
         all_code = "\n".join([dev_code, "", host_code])
 
-        if t_unit[entrypoint].options.write_code:
+        if t_unit[self.entrypoint].options.write_code:
             output = all_code
-            if t_unit[entrypoint].options.allow_terminal_colors:
+            if t_unit[self.entrypoint].options.allow_terminal_colors:
                 output = get_highlighted_code(output)
 
-            if t_unit[entrypoint].options.write_code is True:
+            if t_unit[self.entrypoint].options.write_code is True:
                 print(output)
             else:
-                with open(t_unit[entrypoint].options.write_code, "w") as outf:
+                with open(t_unit[self.entrypoint].options.write_code, "w") as outf:
                     outf.write(output)
 
-        if t_unit[entrypoint].options.edit_code:
+        if t_unit[self.entrypoint].options.edit_code:
             from pytools import invoke_editor
             dev_code = invoke_editor(dev_code, "code.c")
             # update code from editor
@@ -508,18 +515,18 @@ def translation_unit_info(
         c_kernels = []
 
         from loopy.schedule.tools import get_kernel_arg_info
-        kai = get_kernel_arg_info(t_unit[entrypoint])
+        kai = get_kernel_arg_info(t_unit[self.entrypoint])
         for dp in codegen_result.device_programs:
             c_kernels.append(CompiledCKernel(
-                t_unit[entrypoint], dp, kai.passed_names, all_code,
+                t_unit[self.entrypoint], dp, kai.passed_names, all_code,
                 self.compiler))
 
         return _KernelInfo(
                 t_unit=t_unit,
                 c_kernels=c_kernels,
-                invoker=self.get_invoker(t_unit, entrypoint, codegen_result))
+                invoker=self.get_invoker(t_unit, self.entrypoint, codegen_result))
 
-    def __call__(self, *args, entrypoint=None, **kwargs):
+    def __call__(self, *args, **kwargs):
         """
         :returns: ``(None, output)`` the output is a tuple of output arguments
             (arguments that are written as part of the kernel). The order is given
@@ -529,16 +536,13 @@ def __call__(self, *args, entrypoint=None, **kwargs):
             :class:`dict` instead, with keys of argument names and values
             of the returned arrays.
         """
-        assert entrypoint is not None
-
         if __debug__:
             self.check_for_required_array_arguments(kwargs.keys())
 
         if self.packing_controller is not None:
             kwargs = self.packing_controller(kwargs)
 
-        program_info = self.translation_unit_info(entrypoint,
-                self.arg_to_dtype(kwargs))
+        program_info = self.translation_unit_info(self.arg_to_dtype(kwargs))
 
         return program_info.invoker(
                 program_info.c_kernels, *args, **kwargs)
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 2f7ef35f5..f0c1fabd5 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -41,6 +41,7 @@
 from loopy.tools import is_integer
 from loopy.types import LoopyType
 from loopy.target.c import CExpression
+from loopy.typing import ExpressionT
 
 
 __doc__ = """
@@ -84,7 +85,7 @@ def with_assignments(self, names_to_vars):
         type_inf_mapper = self.type_inf_mapper.with_assignments(names_to_vars)
         return type(self)(self.codegen_state, self.fortran_abi, type_inf_mapper)
 
-    def infer_type(self, expr):
+    def infer_type(self, expr: ExpressionT) -> LoopyType:
         result = self.type_inf_mapper(expr)
         assert isinstance(result, LoopyType)
 
@@ -331,7 +332,7 @@ def map_linear_subscript(self, expr, type_context):
     def make_subscript(self, array, base_expr, subscript):
         return base_expr[subscript]
 
-    def map_integer_div_operator(self, base_func_name, op_func, expr, type_context):
+    def _map_integer_div_operator(self, base_func_name, op_func, expr, type_context):
         from loopy.symbolic import get_dependencies
         iname_deps = get_dependencies(expr) & self.kernel.all_inames()
         domain = self.kernel.get_inames_domain(iname_deps)
@@ -342,6 +343,11 @@ def map_integer_div_operator(self, base_func_name, op_func, expr, type_context):
 
         num_type = self.infer_type(expr.numerator)
         den_type = self.infer_type(expr.denominator)
+
+        if not num_type.is_integral() or not den_type.is_integral():
+            raise NotImplementedError("remainder and floordiv "
+                                      "for floating-point types")
+
         from loopy.isl_helpers import is_nonnegative
         num_nonneg = is_nonnegative(expr.numerator, domain) \
             or num_type.numpy_dtype.kind == "u"
@@ -362,10 +368,10 @@ def seen_func(name):
         if den_nonneg:
             if num_nonneg:
                 return op_func(
-                        self.rec(expr.numerator, type_context),
-                        self.rec(expr.denominator, type_context))
+                        self.rec(expr.numerator, "i"),
+                        self.rec(expr.denominator, "i"))
             else:
-                seen_func("%s_pos_b" % base_func_name)
+                seen_func(f"{base_func_name}_pos_b")
                 return var(f"{base_func_name}_pos_b_{suffix}")(
                         self.rec(expr.numerator, "i"),
                         self.rec(expr.denominator, "i"))
@@ -377,7 +383,7 @@ def seen_func(name):
 
     def map_floor_div(self, expr, type_context):
         import operator
-        return self.map_integer_div_operator(
+        return self._map_integer_div_operator(
                 "loopy_floor_div", operator.floordiv, expr, type_context)
 
     def map_remainder(self, expr, type_context):
@@ -386,7 +392,7 @@ def map_remainder(self, expr, type_context):
             raise RuntimeError("complex remainder not defined")
 
         import operator
-        return self.map_integer_div_operator(
+        return self._map_integer_div_operator(
                 "loopy_mod", operator.mod, expr, type_context)
 
     def map_if(self, expr, type_context):
@@ -431,10 +437,8 @@ def map_constant(self, expr, type_context):
                  " The generated code will be equivalent with the added benefit"
                  " of sound pickling/unpickling of kernel objects.")
             from pymbolic.primitives import NaN
-            if not isinstance(expr, np.generic):
-                return self.map_nan(NaN(), type_context)
-            else:
-                return self.map_nan(NaN(expr.dtype.type), type_context)
+            data_type = expr.dtype.type if isinstance(expr, np.generic) else None
+            return self.map_nan(NaN(data_type), type_context)
         elif np.isneginf(expr):
             return -p.Variable("INFINITY")
         elif np.isinf(expr):
@@ -788,13 +792,10 @@ def _map_division_operator(self, operator, expr, enclosing_prec):
                 force_parens_around=self.multiplicative_primitives)
 
         return self.parenthesize_if_needed(
-                "{} {} {}".format(
-                    # Space is necessary--otherwise '/*'
-                    # (i.e. divide-dererference) becomes
-                    # start-of-comment in C.
-                    num_s,
-                    operator,
-                    denom_s),
+                f"{num_s} {operator} {denom_s}",
+                # Space is necessary--otherwise '/*'
+                # (i.e. divide-dereference) becomes
+                # start-of-comment in C.
                 enclosing_prec, PREC_PRODUCT)
 
     def map_quotient(self, expr, enclosing_prec):
diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte
index 165b3abae..d4549d4c7 160000
--- a/loopy/target/c/compyte
+++ b/loopy/target/c/compyte
@@ -1 +1 @@
-Subproject commit 165b3abae63bc39124a342ce1a539adbf6cd8a09
+Subproject commit d4549d4c711513e2cc098d3f5d4e918eac53ee7a
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index f20b8c15a..1c5e601d4 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -525,9 +525,9 @@ def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
                     cast_str = "(%s *) " % (ctype)
 
                 return Block([
-                    POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
+                    POD(self, NumpyType(lhs_dtype.dtype),
                         old_val_var),
-                    POD(self, NumpyType(lhs_dtype.dtype, target=self.target),
+                    POD(self, NumpyType(lhs_dtype.dtype),
                         new_val_var),
                     DoWhile(
                         "atomicCAS("
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index 2584fc4ab..1e49de938 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -38,7 +38,7 @@
 logger = logging.getLogger(__name__)
 
 from pytools.persistent_dict import WriteOncePersistentDict
-from loopy.tools import LoopyKeyBuilder
+from loopy.tools import LoopyKeyBuilder, caches
 from loopy.typing import ExpressionT
 from loopy.types import LoopyType, NumpyType
 from loopy.kernel import KernelState, LoopKernel
@@ -107,11 +107,11 @@ class _ArgFindingEquation:
     lhs: ExpressionT
     rhs: ExpressionT
 
-    # Arg finding code is sorted by priority, lowest order first
+    # Arg finding code is sorted by priority, all equations (across all unknowns)
+    # of lowest priority first.
     order: int
 
     based_on_names: FrozenSet[str]
-    require_names: bool
 
 
 class ExecutionWrapperGeneratorBase(ABC):
@@ -164,8 +164,6 @@ def generate_integer_arg_finding_from_array_data(
 
         equations: List[_ArgFindingEquation] = []
 
-        from pymbolic.primitives import If
-
         for arg_name in kai.passed_arg_names:
             arg = kernel.arg_dict[arg_name]
             assert arg.dtype is not None
@@ -179,10 +177,10 @@ def generate_integer_arg_finding_from_array_data(
                                     lhs=var(arg.name).attr("shape").index(axis_nr),
                                     rhs=shape_i,
                                     order=0,
-                                    based_on_names=frozenset({arg.name}),
-                                    require_names=True))
+                                    based_on_names=frozenset({arg.name})))
 
-                for axis_nr, stride_i in enumerate(get_strides(arg)):
+                strides = get_strides(arg)
+                for axis_nr, stride_i in enumerate(strides):
                     if stride_i is not None:
                         equations.append(
                                 _ArgFindingEquation(
@@ -192,43 +190,68 @@ def generate_integer_arg_finding_from_array_data(
                                     rhs=_str_to_expr(stride_i),
                                     order=0,
                                     based_on_names=frozenset({arg.name}),
-                                    require_names=True))
-
-                if arg.offset is not None:
-                    if not kernel.options.no_numpy:
-                        offset = var("getattr")(var(arg.name), var('"offset"'), 0)
-                    else:
-                        offset = var(arg.name).attr("offset")
+                                    ))
 
-                    offset = If(var(f"{arg.name} is None"), 0, offset)
+                        if not arg.is_input and isinstance(arg.shape, tuple):
+                            # If no value was found by other means, provide
+                            # C-contiguous default strides for output-only
+                            # arguments.
+                            equations.append(
+                                    _ArgFindingEquation(
+                                        lhs=(strides[axis_nr + 1]
+                                             * arg.shape[axis_nr + 1])
+                                        if axis_nr + 1 < len(strides)
+                                        else 1,
+                                        rhs=_str_to_expr(stride_i),
+                                        # Find strides from last dim to first,
+                                        # starting at order=1 so that shape
+                                        # parameters (found above) are
+                                        # available.
+                                        order=len(strides) - axis_nr,
+                                        based_on_names=frozenset(),
+                                        ))
 
+                if arg.offset is not None:
                     equations.append(
                             _ArgFindingEquation(
-                                lhs=var("_lpy_even_div")(
-                                    offset, arg.dtype.itemsize),
+                                lhs=var("_lpy_even_div_none")(
+                                    var("getattr")(
+                                        var(arg.name), var('"offset"'), var("None")),
+                                    arg.dtype.itemsize),
                                 rhs=_str_to_expr(arg.offset),
+                                order=0,
+                                based_on_names=frozenset([arg.name]),
+                                ))
 
-                                # Argument finding from offsets should run last,
-                                # as it assumes a zero offset if a variable is
-                                # not passed. That should only be done if no
-                                # other approach yielded a value for the variable.
+                    # If no value was found by other means, default to zero.
+                    equations.append(
+                            _ArgFindingEquation(
+                                lhs=0,
+                                rhs=_str_to_expr(arg.offset),
                                 order=1,
-                                based_on_names=frozenset(arg.name),
-                                require_names=False,
+                                based_on_names=frozenset(),
                                 ))
 
         # }}}
 
         # {{{ regroup equations by unknown
 
-        unknown_to_equations: Dict[str, List[_ArgFindingEquation]] = {}
+        order_to_unknown_to_equations: \
+                Dict[int, Dict[str, List[_ArgFindingEquation]]] = {}
 
         for eqn in equations:
             deps = dep_map(eqn.rhs)
 
             if len(deps) == 1:
                 unknown_var, = deps
-                unknown_to_equations.setdefault(unknown_var.name, []).append((eqn))
+                order_to_unknown_to_equations \
+                        .setdefault(eqn.order, {}) \
+                        .setdefault(unknown_var.name, []) \
+                        .append((eqn))
+            else:
+                # Zero deps: nothing to determine, forget about it.
+                # 2+ deps: not implemented
+                pass
 
         del equations
 
@@ -243,72 +266,67 @@ def generate_integer_arg_finding_from_array_data(
         gen("# {{{ find integer arguments from array data")
         gen("")
 
-        for unknown_name in sorted(unknown_to_equations):
-            unk_equations = sorted(unknown_to_equations[unknown_name],
-                    key=lambda eqn: eqn.order)
-            req_subgen = CodeGenerator()
-            not_req_subgen = CodeGenerator()
+        for order_value in sorted(order_to_unknown_to_equations):
+            for unknown_name in sorted(order_to_unknown_to_equations[order_value]):
+                unk_equations = sorted(
+                        order_to_unknown_to_equations[order_value][unknown_name],
+                        key=lambda eqn: eqn.order)
+                subgen = CodeGenerator()
 
-            seen_based_on_names: Set[FrozenSet[str]] = set()
+                seen_based_on_names: Set[FrozenSet[str]] = set()
 
-            if_or_elif = "if"
+                if_or_elif = "if"
 
-            for eqn in unk_equations:
-                try:
-                    # overkill :)
-                    value_expr = solve_affine_equations_for(
-                            [unknown_name],
-                            [(eqn.lhs, eqn.rhs)]
-                            )[Variable(unknown_name)]
-                except Exception as e:
-                    # went wrong? oh well
-                    from warnings import warn
-                    warn("Unable to generate code to automatically "
-                            f"find '{unknown_name}' "
-                            f"from '{', '.join(eqn.based_on_names)}':\n"
-                            f"{e}", ParameterFinderWarning)
-                    continue
-
-                # Do not use more than one bit of data from each of the
-                # 'based_on_names' to find each value, i.e. if a value can be
-                # found via shape and strides, only one of them suffices.
-                # This also helps because strides can be unreliable in the
-                # face of zero-length axes.
-                if eqn.based_on_names in seen_based_on_names:
-                    continue
-                seen_based_on_names.add(eqn.based_on_names)
-
-                if eqn.require_names:
-                    condition = " and ".join(
-                            f"{ary_name} is not None"
-                            for ary_name in eqn.based_on_names)
-                    req_subgen(f"{if_or_elif} {condition}:")
-                    with Indentation(req_subgen):
-                        req_subgen(
+                for eqn in unk_equations:
+                    if eqn.rhs == Variable(unknown_name):
+                        # Some of the expressions above are non-affine. Let's not
+                        # get carried away by trying to solve a much more complex
+                        # problem than needed.
+                        value_expr = eqn.lhs
+                    else:
+                        try:
+                            # overkill :)
+                            value_expr = solve_affine_equations_for(
+                                    [unknown_name],
+                                    [(eqn.lhs, eqn.rhs)]
+                                    )[Variable(unknown_name)]
+                        except Exception as e:
+                            # went wrong? oh well
+                            from warnings import warn
+                            warn("Unable to generate code to automatically "
+                                    f"find '{unknown_name}' "
+                                    f"from '{', '.join(eqn.based_on_names)}':\n"
+                                    f"{e}", ParameterFinderWarning)
+                            continue
+
+                    # Do not use more than one bit of data from each of the
+                    # 'based_on_names' to find each value, i.e. if a value can be
+                    # found via shape and strides, only one of them suffices.
+                    # This also helps because strides can be unreliable in the
+                    # face of zero-length axes.
+                    if eqn.based_on_names in seen_based_on_names:
+                        continue
+                    seen_based_on_names.add(eqn.based_on_names)
+
+                    if eqn.based_on_names:
+                        condition = " and ".join(
+                                f"{ary_name} is not None"
+                                for ary_name in eqn.based_on_names)
+                    else:
+                        condition = "True"
+
+                    subgen(f"{if_or_elif} {condition}:")
+                    with Indentation(subgen):
+                        subgen(
                                 f"{unknown_name} = {StringifyMapper()(value_expr)}")
                     if_or_elif = "elif"
 
-                    req_subgen("")
-                else:
-                    not_req_subgen(
-                            f"{unknown_name} = {StringifyMapper()(value_expr)}")
-
-                    not_req_subgen("")
+                    subgen("")
 
-            if not_req_subgen.code:
-                gen(f"if {unknown_name} is None:")
-                with Indentation(gen):
-                    gen.extend(not_req_subgen)
-
-                    if req_subgen.code:
-                        # still? try the req_subgen
-                        gen(f"if {unknown_name} is None:")
-                        with Indentation(gen):
-                            gen.extend(req_subgen)
-            elif req_subgen.code:
-                gen(f"if {unknown_name} is None:")
-                with Indentation(gen):
-                    gen.extend(req_subgen)
+                if subgen.code:
+                    gen(f"if {unknown_name} is None:")
+                    with Indentation(gen):
+                        gen.extend(subgen)
 
         gen("# }}}")
         gen("")
@@ -708,18 +726,24 @@ def __call__(self, program, entrypoint, codegen_result):
         key_builder=LoopyKeyBuilder())
 
 
+caches.append(typed_and_scheduled_cache)
+
+
 invoker_cache = WriteOncePersistentDict(
         "loopy-invoker-cache-v10-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
 
 
+caches.append(invoker_cache)
+
+
 # {{{ kernel executor
 
-class KernelExecutorBase:
-    """An object connecting a kernel to a :class:`pyopencl.Context`
-    for execution.
+class ExecutorBase:
+    """An object allowing the execution of an entrypoint of a
+    :class:`~loopy.TranslationUnit`. Create these objects using
+    :meth:`loopy.TranslationUnit.executor`.
 
-    .. automethod:: __init__
     .. automethod:: __call__
     """
     packing_controller: Optional[SeparateArrayPackingController]
@@ -753,14 +777,14 @@ def __init__(self, t_unit: TranslationUnit, entrypoint: str):
             self.packing_controller = SeparateArrayPackingController(self.sep_info)
         else:
             self.packing_controller = None
-            return None
 
     def check_for_required_array_arguments(self, input_args):
         # Formerly, the first exception raised when a required argument is not
         # passed was often at type inference. This exists to raise a more meaningful
         # message in such scenarios. Since type inference precedes compilation, this
         # check cannot be deferred to the generated invoker code.
-        # See discussion at github.com/inducer/loopy/pull/160#issuecomment-867761204
+        # See discussion at
+        # https://github.com/inducer/loopy/pull/160#issuecomment-867761204
         # and links therin for context.
         if not self.input_array_names <= set(input_args):
             missing_args = self.input_array_names - set(input_args)
@@ -772,12 +796,12 @@ def check_for_required_array_arguments(self, input_args):
                 "your argument.")
 
     def get_typed_and_scheduled_translation_unit_uncached(
-            self, entrypoint, arg_to_dtype: Optional[Map[str, LoopyType]]
+            self, arg_to_dtype: Optional[Map[str, LoopyType]]
             ) -> TranslationUnit:
         t_unit = self.t_unit
 
         if arg_to_dtype:
-            entry_knl = t_unit[entrypoint]
+            entry_knl = t_unit[self.entrypoint]
 
             # FIXME: This is not so nice. This transfers types from the
             # subarrays of sep-tagged arrays to the 'main' array, because
@@ -809,7 +833,7 @@ def get_typed_and_scheduled_translation_unit_uncached(
         return t_unit
 
     def get_typed_and_scheduled_translation_unit(
-            self, entrypoint: str, arg_to_dtype: Optional[Map[str, LoopyType]]
+            self, arg_to_dtype: Optional[Map[str, LoopyType]]
             ) -> TranslationUnit:
         from loopy import CACHING_ENABLED
 
@@ -824,8 +848,7 @@ def get_typed_and_scheduled_translation_unit(
         logger.debug("%s: typed-and-scheduled cache miss" %
                 self.t_unit.entrypoints)
 
-        kernel = self.get_typed_and_scheduled_translation_unit_uncached(entrypoint,
-                arg_to_dtype)
+        kernel = self.get_typed_and_scheduled_translation_unit_uncached(arg_to_dtype)
 
         if CACHING_ENABLED:
             typed_and_scheduled_cache.store_if_not_present(cache_key, kernel)
@@ -861,8 +884,7 @@ def get_highlighted_code(self, entrypoint, arg_to_dtype=None, code=None):
     def get_code(
             self, entrypoint: str,
             arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> str:
-        kernel = self.get_typed_and_scheduled_translation_unit(
-                entrypoint, arg_to_dtype)
+        kernel = self.get_typed_and_scheduled_translation_unit(arg_to_dtype)
 
         from loopy.codegen import generate_code_v2
         code = generate_code_v2(kernel)
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index e5ce78c58..217f7a795 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -31,7 +31,7 @@
 from pymbolic import var
 from pymbolic.mapper.stringifier import PREC_NONE
 from pytools import memoize_method
-from cgen import Generable, Declarator, Const
+from cgen import Generable, Declarator, Const, Collection
 
 from loopy.target.c import CFamilyTarget, CFamilyASTBuilder
 from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
@@ -476,7 +476,7 @@ def emit_assignment(self, codegen_state, insn):
         return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code)
 
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
-            lbound, ubound, inner):
+            lbound, ubound, inner, hints):
         ecm = codegen_state.expression_to_code_mapper
 
         from loopy.target.c import POD
@@ -486,7 +486,7 @@ def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
 
         from cgen.ispc import ISPCUniform
 
-        return For(
+        loop = For(
                 InlineInitializer(
                     ISPCUniform(POD(self, iname_dtype, iname)),
                     ecm(lbound, PREC_NONE, "i")),
@@ -496,6 +496,11 @@ def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
                 "++%s" % iname,
                 inner)
 
+        if hints:
+            return Collection(list(hints) + [loop])
+        else:
+            return loop
+
     # }}}
 
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index c807a5360..247c00f02 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -782,6 +782,14 @@ def emit_atomic_init(self, codegen_state, lhs_atomicity, lhs_var,
         return self.emit_atomic_update(codegen_state, lhs_atomicity, lhs_var,
             lhs_expr, rhs_expr, lhs_dtype, rhs_type_context)
 
+    def emit_unroll_hint(self, value):
+        # See https://man.opencl.org/attributes-loopUnroll.html
+        from cgen import Line
+        if value:  # truthy: forward it as the hint's argument
+            return Line(f"__attribute__((opencl_unroll_hint({value})))")
+        else:  # falsy (e.g. None or 0): emit the argument-less form
+            return Line("__attribute__((opencl_unroll_hint))")
+
     def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
             lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
         from pymbolic.mapper.stringifier import PREC_NONE
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 92f4bbd96..40963a85e 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -1,3 +1,4 @@
+from __future__ import annotations
 """OpenCL target integrated with PyOpenCL."""
 
 __copyright__ = "Copyright (C) 2015 Andreas Kloeckner"
@@ -23,7 +24,7 @@
 """
 
 from warnings import warn
-from typing import Sequence, Tuple, List, Union, Optional, cast
+from typing import Sequence, Tuple, List, Union, Optional, cast, Any, TYPE_CHECKING
 
 import numpy as np
 import pymbolic.primitives as p
@@ -34,8 +35,10 @@
 
 from loopy.target.opencl import (OpenCLTarget, OpenCLCASTBuilder,
         ExpressionToOpenCLCExpressionMapper)
+from loopy.target.pyopencl_execution import PyOpenCLExecutor
 from loopy.target.python import PythonASTBuilderBase
 from loopy.kernel import LoopKernel
+from loopy.translation_unit import FunctionIdT, TranslationUnit
 from loopy.types import NumpyType
 from loopy.typing import ExpressionT
 from loopy.diagnostic import LoopyError, LoopyTypeError
@@ -49,6 +52,9 @@
 import logging
 logger = logging.getLogger(__name__)
 
+if TYPE_CHECKING:
+    import pyopencl as cl
+
 
 # {{{ pyopencl function scopers
 
@@ -595,55 +601,22 @@ def alignment_requirement(self, type_decl):
     # }}}
 
     def get_kernel_executor_cache_key(self, queue, **kwargs):
-        import weakref
-        # Use weakref for CL context to avoid keeping context artifically alive
-        return (weakref.ref(queue.context), kwargs["entrypoint"])
-
-    def preprocess_translation_unit_for_passed_args(self, t_unit, epoint,
-                                                   passed_args_dict):
-
-        # {{{ ValueArgs -> GlobalArgs if passed as array shapes
-
-        from loopy.kernel.data import ValueArg, GlobalArg
-        import pyopencl.array as cla
-
-        knl = t_unit[epoint]
-        new_args = []
-
-        for arg in knl.args:
-            if isinstance(arg, ValueArg):
-                if (arg.name in passed_args_dict
-                        and isinstance(passed_args_dict[arg.name], cla.Array)
-                        and passed_args_dict[arg.name].shape == ()):
-                    arg = GlobalArg(name=arg.name, dtype=arg.dtype, shape=(),
-                                    is_output=False, is_input=True)
-
-            new_args.append(arg)
-
-        knl = knl.copy(args=new_args)
-
-        t_unit = t_unit.with_kernel(knl)
-
-        # }}}
-
-        return t_unit
-
-    def get_kernel_executor(self, program, queue, **kwargs):
-        from loopy.target.pyopencl_execution import PyOpenCLKernelExecutor
-
-        epoint = kwargs.pop("entrypoint")
-        program = self.preprocess_translation_unit_for_passed_args(program,
-                                                                   epoint,
-                                                                   kwargs)
-
-        return PyOpenCLKernelExecutor(queue.context, program,
-                                      entrypoint=epoint)
+        return (queue.context, kwargs["entrypoint"])
+
+    # type-ignore because we're making things from *args: Any more concrete,
+    # and mypy doesn't like it.
+    def get_kernel_executor(self, t_unit: TranslationUnit,  # type: ignore[override]
+                            queue_or_context: Union[cl.CommandQueue, cl.Context],
+                            *args: Any, entrypoint: FunctionIdT, **kwargs: Any
+                            ) -> PyOpenCLExecutor:
+        from pyopencl import CommandQueue
+        if isinstance(queue_or_context, CommandQueue):
+            context = queue_or_context.context
+        else:
+            context = queue_or_context
 
-    def with_device(self, device):
-        from warnings import warn
-        warn("PyOpenCLTarget.with_device is deprecated, it will "
-                "stop working in 2022.", DeprecationWarning, stacklevel=2)
-        return self
+        from loopy.target.pyopencl_execution import PyOpenCLExecutor
+        return PyOpenCLExecutor(context, t_unit, entrypoint=entrypoint)
 
 # }}}
 
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index dd253a223..b65bdc66e 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -34,10 +34,9 @@
 from loopy.typing import ExpressionT
 from loopy.kernel import LoopKernel
 from loopy.kernel.data import ArrayArg
-from loopy.translation_unit import TranslationUnit
 from loopy.schedule.tools import KernelArgInfo
 from loopy.target.execution import (
-    KernelExecutorBase, ExecutionWrapperGeneratorBase)
+    ExecutorBase, ExecutionWrapperGeneratorBase)
 import logging
 logger = logging.getLogger(__name__)
 
@@ -270,7 +269,6 @@ def get_arg_pass(self, arg):
 
 @dataclass(frozen=True)
 class _KernelInfo:
-    t_unit: TranslationUnit
     cl_kernels: "_Kernels"
     invoker: Callable[..., Any]
 
@@ -281,7 +279,7 @@ class _Kernels:
 
 # {{{ kernel executor
 
-class PyOpenCLKernelExecutor(KernelExecutorBase):
+class PyOpenCLExecutor(ExecutorBase):
     """An object connecting a kernel to a :class:`pyopencl.Context`
     for execution.
 
@@ -303,10 +301,9 @@ def get_wrapper_generator(self):
 
     @memoize_method
     def translation_unit_info(
-            self, entrypoint: str,
+            self,
             arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> _KernelInfo:
-        t_unit = self.get_typed_and_scheduled_translation_unit(
-                entrypoint, arg_to_dtype)
+        t_unit = self.get_typed_and_scheduled_translation_unit(arg_to_dtype)
 
         # FIXME: now just need to add the types to the arguments
         from loopy.codegen import generate_code_v2
@@ -315,19 +312,21 @@ def translation_unit_info(
 
         dev_code = codegen_result.device_code()
 
-        if t_unit[entrypoint].options.write_code:
+        if t_unit[self.entrypoint].options.write_code:
             #FIXME: redirect to "translation unit" level option as well.
             output = dev_code
-            if self.t_unit[entrypoint].options.allow_terminal_colors:
+            if self.t_unit[self.entrypoint].options.allow_terminal_colors:
                 output = get_highlighted_code(output)
 
-            if self.t_unit[entrypoint].options.write_code is True:
+            if self.t_unit[self.entrypoint].options.write_code is True:
                 print(output)
             else:
-                with open(self.t_unit[entrypoint].options.write_code, "w") as outf:
+                with open(
+                        self.t_unit[self.entrypoint].options.write_code, "w"
+                        ) as outf:
                     outf.write(output)
 
-        if t_unit[entrypoint].options.edit_code:
+        if t_unit[self.entrypoint].options.edit_code:
             #FIXME: redirect to "translation unit" level option as well.
             from pytools import invoke_editor
             dev_code = invoke_editor(dev_code, "code.cl")
@@ -337,19 +336,18 @@ def translation_unit_info(
         #FIXME: redirect to "translation unit" level option as well.
         cl_program = (
                 cl.Program(self.context, dev_code)
-                .build(options=t_unit[entrypoint].options.build_options))
+                .build(options=t_unit[self.entrypoint].options.build_options))
 
         cl_kernels = _Kernels()
         for dp in cl_program.kernel_names.split(";"):
             setattr(cl_kernels, dp, getattr(cl_program, dp))
 
         return _KernelInfo(
-                t_unit=t_unit,
                 cl_kernels=cl_kernels,
-                invoker=self.get_invoker(t_unit, entrypoint, codegen_result))
+                invoker=self.get_invoker(t_unit, self.entrypoint, codegen_result))
 
     def __call__(self, queue, *,
-            allocator=None, wait_for=None, out_host=None, entrypoint=None,
+            allocator=None, wait_for=None, out_host=None,
             **kwargs):
         """
         :arg allocator: a callable passed a byte count and returning
@@ -377,19 +375,13 @@ def __call__(self, queue, *,
             of the returned arrays.
         """
 
-        assert entrypoint is not None
-
-        # FIXME: Remove entrypoint argument
-        assert entrypoint == self.entrypoint
-
         if __debug__:
             self.check_for_required_array_arguments(kwargs.keys())
 
         if self.packing_controller is not None:
             kwargs = self.packing_controller(kwargs)
 
-        translation_unit_info = self.translation_unit_info(entrypoint,
-                self.arg_to_dtype(kwargs))
+        translation_unit_info = self.translation_unit_info(self.arg_to_dtype(kwargs))
 
         return translation_unit_info.invoker(
                 translation_unit_info.cl_kernels, queue, allocator, wait_for,
diff --git a/loopy/target/python.py b/loopy/target/python.py
index f9cc06147..f93d2b44e 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -226,12 +226,15 @@ def ast_block_scope_class(self):
         return Collection
 
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
-            lbound, ubound, inner):
+            lbound, ubound, inner, hints):
         ecm = codegen_state.expression_to_code_mapper
 
         from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM
         from genpy import For
 
+        if hints:
+            raise ValueError("hints for python loops not supported")
+
         return For(
                 (iname,),
                 "range(%s, %s + 1)"
diff --git a/loopy/tools.py b/loopy/tools.py
index 2b64e7325..ca4256b0d 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -20,6 +20,7 @@
 THE SOFTWARE.
 """
 
+from typing import List
 import collections.abc as abc
 from functools import cached_property
 
@@ -27,7 +28,8 @@
 import islpy as isl
 import numpy as np
 from pytools import memoize_method, ProcessLogger
-from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
+from pytools.persistent_dict import (
+        KeyBuilder as KeyBuilderBase, WriteOncePersistentDict)
 from loopy.symbolic import (UncachedWalkMapper as LoopyWalkMapper,
                             RuleAwareIdentityMapper)
 from pymbolic.mapper.persistent_hash import (
@@ -101,11 +103,6 @@ def update_for_dict(self, key_hash, key):
 
     update_for_defaultdict = update_for_dict
 
-    def update_for_frozenset(self, key_hash, key):
-        for set_key in sorted(key,
-                key=lambda obj: type(obj).__name__ + str(obj)):
-            self.rec(key_hash, set_key)
-
     def update_for_BasicSet(self, key_hash, key):  # noqa
         from islpy import Printer
         prn = Printer.to_str(key.get_ctx())
@@ -134,7 +131,7 @@ def __init__(self, expression):
         self.expression = expression
 
     def __eq__(self, other):
-        return (type(self) == type(other)
+        return (type(self) is type(other)
                 and self.expression == other.expression)
 
     def __ne__(self, other):
@@ -862,7 +859,7 @@ def t_unit_to_python(t_unit, var_name="t_unit",
                                                                .callables_table))
                      for name, clbl in t_unit.callables_table.items()
                      if isinstance(clbl, CallableKernel)}
-    t_unit = t_unit.copy(callables_table=new_callables)
+    t_unit = t_unit.copy(callables_table=Map(new_callables))
 
     knl_python_code_srcs = [_kernel_to_python(clbl.subkernel,
                                               name in t_unit.entrypoints,
@@ -892,6 +889,18 @@ def t_unit_to_python(t_unit, var_name="t_unit",
 # }}}
 
 
+# {{{ cache management
+
+caches: List[WriteOncePersistentDict] = []
+
+
+def clear_in_mem_caches() -> None:
+    for cache in caches:
+        cache.clear_in_mem_cache()
+
+# }}}
+
+
 # {{{ memoize_on_disk
 
 def memoize_on_disk(func, key_builder_t=LoopyKeyBuilder):
@@ -909,6 +918,8 @@ def memoize_on_disk(func, key_builder_t=LoopyKeyBuilder):
             f"-v0-{DATA_MODEL_VERSION}"),
         key_builder=key_builder_t())
 
+    caches.append(transform_cache)
+
     @wraps(func)
     def wrapper(*args, **kwargs):
         from loopy import CACHING_ENABLED
diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py
index 3c4092b74..f04719c77 100644
--- a/loopy/transform/array_buffer_map.py
+++ b/loopy/transform/array_buffer_map.py
@@ -21,16 +21,23 @@
 """
 
 
+from dataclasses import dataclass, replace
+from abc import ABC, abstractmethod
+from typing import Optional, Callable, Sequence, Tuple, Any
+from typing_extensions import Self
 import islpy as isl
 from islpy import dim_type
 from loopy.symbolic import (get_dependencies, SubstitutionMapper)
 from pymbolic.mapper.substitutor import make_subst_func
 
-from pytools import ImmutableRecord, memoize_method
+from pytools import memoize_method
 from pymbolic import var
 
+from loopy.typing import ExpressionT
 
-class AccessDescriptor(ImmutableRecord):
+
+@dataclass(frozen=True)
+class AccessDescriptor:
     """
     .. attribute:: identifier
 
@@ -38,10 +45,11 @@ class AccessDescriptor(ImmutableRecord):
         to the access that generated it. Any Python value.
     """
 
-    __slots__ = [
-            "identifier",
-            "storage_axis_exprs",
-            ]
+    identifier: Any = None
+    storage_axis_exprs: Optional[Sequence[ExpressionT]] = None
+
+    def copy(self, **kwargs) -> Self:
+        return replace(self, **kwargs)
 
 
 def to_parameters_or_project_out(param_inames, set_inames, set):
@@ -62,9 +70,12 @@ def to_parameters_or_project_out(param_inames, set_inames, set):
 
 # {{{ construct storage->sweep map
 
-def build_per_access_storage_to_domain_map(storage_axis_exprs, domain,
-        storage_axis_names,
-        prime_sweep_inames):
+def build_per_access_storage_to_domain_map(
+        storage_axis_exprs: Sequence[ExpressionT],
+        domain: isl.BasicSet,
+        storage_axis_names: Sequence[str],
+        prime_sweep_inames: Callable[[ExpressionT], ExpressionT]
+        ) -> isl.BasicMap:
 
     map_space = domain.space
     stor_dim = len(storage_axis_names)
@@ -124,10 +135,8 @@ def move_to_par_from_out(s2smap, except_inames):
             return s2smap
 
 
-def build_global_storage_to_sweep_map(kernel, access_descriptors,
-        domain_dup_sweep, dup_sweep_index,
-        storage_axis_names,
-        sweep_inames, primed_sweep_inames, prime_sweep_inames):
+def build_global_storage_to_sweep_map(access_descriptors,
+        domain_dup_sweep, storage_axis_names, prime_sweep_inames):
     # The storage map goes from storage axes to the domain.
     # The first len(arg_names) storage dimensions are the rule's arguments.
 
@@ -192,7 +201,23 @@ def compute_bounds(kernel, domain, stor2sweep,
 
 # {{{ array-to-buffer map
 
-class ArrayToBufferMap:
+class ArrayToBufferMapBase(ABC):
+    non1_storage_axis_names: Tuple[str, ...]
+    storage_base_indices: Tuple[ExpressionT, ...]
+    non1_storage_shape: Tuple[ExpressionT, ...]
+    non1_storage_axis_flags: Tuple[ExpressionT, ...]
+
+    @abstractmethod
+    def is_access_descriptor_in_footprint(self, accdesc: AccessDescriptor) -> bool:
+        ...
+
+    @abstractmethod
+    def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names,
+            boxify_sweep=False):
+        ...
+
+
+class ArrayToBufferMap(ArrayToBufferMapBase):
     def __init__(self, kernel, domain, sweep_inames, access_descriptors,
             storage_axis_count):
         self.kernel = kernel
@@ -221,10 +246,10 @@ def __init__(self, kernel, domain, sweep_inames, access_descriptors,
         # # }}}
 
         self.stor2sweep = build_global_storage_to_sweep_map(
-                kernel, access_descriptors,
-                domain_dup_sweep, dup_sweep_index,
+                access_descriptors,
+                domain_dup_sweep,
                 storage_axis_names,
-                sweep_inames, self.primed_sweep_inames, self.prime_sweep_inames)
+                self.prime_sweep_inames)
 
         storage_base_indices, storage_shape = compute_bounds(
                 kernel, domain, self.stor2sweep, self.primed_sweep_inames,
@@ -298,7 +323,7 @@ def __init__(self, kernel, domain, sweep_inames, access_descriptors,
         self.non1_storage_axis_flags = non1_storage_axis_flags
         self.aug_domain = aug_domain
         self.storage_base_indices = storage_base_indices
-        self.non1_storage_shape = non1_storage_shape
+        self.non1_storage_shape = tuple(non1_storage_shape)
 
     def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names,
             boxify_sweep=False):
@@ -336,7 +361,8 @@ def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names,
         else:
             return convexify(domain)
 
-    def is_access_descriptor_in_footprint(self, accdesc):
+    def is_access_descriptor_in_footprint(self, accdesc: AccessDescriptor) -> bool:
+        assert accdesc.storage_axis_exprs is not None
         return self._is_access_descriptor_in_footprint_inner(
                 tuple(accdesc.storage_axis_exprs))
 
@@ -399,17 +425,20 @@ def _is_access_descriptor_in_footprint_inner(self, storage_axis_exprs):
                 aligned_g_s2s_parm_dom)
 
 
-class NoOpArrayToBufferMap:
+class NoOpArrayToBufferMap(ArrayToBufferMapBase):
     non1_storage_axis_names = ()
     storage_base_indices = ()
     non1_storage_shape = ()
 
-    def is_access_descriptor_in_footprint(self, accdesc):
+    def is_access_descriptor_in_footprint(self, accdesc: AccessDescriptor) -> bool:
         # no index dependencies--every reference to the subst rule
         # is necessarily in the footprint.
 
         return True
 
+    def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names,
+            boxify_sweep=False):
+        return domain
 # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index b3fc69671..b77c6a5ed 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -20,6 +20,7 @@
 THE SOFTWARE.
 """
 
+from immutables import Map
 from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap,
         AccessDescriptor)
 from loopy.symbolic import (get_dependencies,
@@ -524,7 +525,7 @@ def buffer_array(program, *args, **kwargs):
 
         new_callables[func_id] = clbl
 
-    return program.copy(callables_table=new_callables)
+    return program.copy(callables_table=Map(new_callables))
 
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py
index 3b239dfc7..33196ca67 100644
--- a/loopy/transform/callable.py
+++ b/loopy/transform/callable.py
@@ -21,6 +21,7 @@
 """
 
 import islpy as isl
+from immutables import Map
 
 from pytools import UniqueNameGenerator
 
@@ -105,12 +106,12 @@ def merge(translation_units):
 
     callables_table = {}
     for trans_unit in translation_units:
-        callables_table.update(trans_unit.callables_table.copy())
+        callables_table.update(trans_unit.callables_table)
 
     return TranslationUnit(
             entrypoints=frozenset().union(*(
                 t.entrypoints or frozenset() for t in translation_units)),
-            callables_table=callables_table,
+            callables_table=Map(callables_table),
             target=translation_units[0].target)
 
 
@@ -576,7 +577,7 @@ def rename_callable(program, old_name, new_name=None, existing_ok=False):
         new_entrypoints = ((new_entrypoints | frozenset([new_name]))
                            - frozenset([old_name]))
 
-    return program.copy(callables_table=new_callables_table,
+    return program.copy(callables_table=Map(new_callables_table),
                         entrypoints=new_entrypoints)
 
 # }}}
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index e2c21e0dc..1b97087b8 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -24,16 +24,16 @@
 
 from dataclasses import dataclass, replace
 
-from typing import Optional, Tuple, Dict
+from typing import Optional, Tuple, Dict, cast
 
 import numpy as np
-
+from immutables import Map
 from islpy import dim_type
 
 from pytools import MovedFunctionDeprecationWrapper
 
 from loopy.diagnostic import LoopyError
-from loopy.kernel.data import ImageArg, auto, TemporaryVariable
+from loopy.kernel.data import AddressSpace, ImageArg, auto, TemporaryVariable
 
 from loopy.types import LoopyType
 from loopy.typing import ExpressionT
@@ -146,15 +146,10 @@ def _process_footprint_subscripts(kernel, rule_name, sweep_inames,
 # }}}
 
 
-class _not_provided:  # noqa: N801
-    pass
-
-
 def add_prefetch_for_single_kernel(kernel, callables_table, var_name,
         sweep_inames=None, dim_arg_names=None,
 
-        # "None" is a valid value here, distinct from the default.
-        default_tag=_not_provided,
+        default_tag=None,
 
         rule_name=None,
         temporary_name=None,
@@ -414,7 +409,7 @@ def add_prefetch(program, *args, **kwargs):
 
         new_callables[func_id] = in_knl_callable
 
-    return program.copy(callables_table=new_callables)
+    return program.copy(callables_table=Map(new_callables))
 
 # }}}
 
@@ -1002,7 +997,8 @@ def allocate_temporaries_for_base_storage(kernel: LoopKernel,
 
     vng = kernel.get_var_name_generator()
 
-    name_aspace_dtype_to_bsi: Dict[Tuple[str, int, LoopyType], _BaseStorageInfo] = {}
+    name_aspace_dtype_to_bsi: Dict[
+            Tuple[str, AddressSpace, LoopyType], _BaseStorageInfo] = {}
 
     for tv in sorted(
             kernel.temporary_variables.values(),
@@ -1037,7 +1033,8 @@ def allocate_temporaries_for_base_storage(kernel: LoopKernel,
                 # FIXME: Could use approximate values of ValueArgs
                 approx_array_nbytes = 0
 
-            bs_key = (tv.base_storage, tv.address_space, tv.dtype)
+            bs_key = (tv.base_storage,
+                      cast(AddressSpace, tv.address_space), tv.dtype)
             bsi = name_aspace_dtype_to_bsi.get(bs_key)
 
             if bsi is None or (
diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index 2fd39cfc2..fe0bddcf3 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -449,7 +449,7 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None):
 
     new_callables[result.name] = CallableKernel(result)
 
-    return TranslationUnit(callables_table=new_callables,
+    return TranslationUnit(callables_table=Map(new_callables),
                            target=result.target,
                            entrypoints=frozenset([result.name]))
 
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 673920364..51f970253 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -35,6 +35,7 @@
 from loopy.kernel import LoopKernel
 from loopy.kernel.function_interface import CallableKernel
 
+from typing import Optional
 
 __doc__ = """
 .. currentmodule:: loopy
@@ -2368,7 +2369,7 @@ def add_inames_for_unused_hw_axes(kernel, within=None):
 @for_each_kernel
 @remove_any_newly_unused_inames
 def rename_inames(kernel, old_inames, new_iname, existing_ok=False,
-                  within=None, raise_on_domain_mismatch: bool = __debug__):
+                  within=None, raise_on_domain_mismatch: Optional[bool] = None):
     r"""
     :arg old_inames: A collection of inames that must be renamed to **new_iname**.
     :arg within: a stack match as understood by
@@ -2396,6 +2397,9 @@ def rename_inames(kernel, old_inames, new_iname, existing_ok=False,
         raise LoopyError("old_inames contains nested inames"
                          " -- renaming is illegal.")
 
+    if raise_on_domain_mismatch is None:
+        raise_on_domain_mismatch = __debug__
+
     # sort to have deterministic implementation.
     old_inames = sorted(old_inames)
 
@@ -2504,18 +2508,23 @@ def does_insn_involve_iname(kernel, insn, *args):
 
 @for_each_kernel
 def rename_iname(kernel, old_iname, new_iname, existing_ok=False,
-                 within=None, preserve_tags=True):
-    """
+                 within=None, preserve_tags=True,
+                 raise_on_domain_mismatch: Optional[bool] = None):
+    r"""
     Single iname version of :func:`loopy.rename_inames`.
-    :arg existing_ok: execute even if *new_iname* already exists
+    :arg existing_ok: execute even if *new_iname* already exists.
     :arg within: a stack match understood by :func:`loopy.match.parse_stack_match`.
-    :arg preserve_tags: copy the tags on the old iname to the new iname
+    :arg preserve_tags: copy the tags on the old iname to the new iname.
+    :arg raise_on_domain_mismatch: If *True*, raises an error if
+        :math:`\exists (i_1,i_2) \in \{\text{old\_inames}\}^2 |
+        \mathcal{D}_{i_1} \neq \mathcal{D}_{i_2}`.
     """
     from itertools import product
     from loopy import tag_inames
 
     tags = kernel.inames[old_iname].tags
-    kernel = rename_inames(kernel, [old_iname], new_iname, existing_ok, within)
+    kernel = rename_inames(kernel, [old_iname], new_iname, existing_ok,
+                           within, raise_on_domain_mismatch)
     if preserve_tags:
         kernel = tag_inames(kernel, product([new_iname], tags))
     return kernel
diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py
index daf9316cd..6a39986a3 100644
--- a/loopy/transform/pack_and_unpack_args.py
+++ b/loopy/transform/pack_and_unpack_args.py
@@ -20,6 +20,7 @@
 THE SOFTWARE.
 """
 
+from immutables import Map
 from loopy.diagnostic import LoopyError
 from loopy.kernel.instruction import CallInstruction
 from loopy.translation_unit import TranslationUnit
@@ -335,6 +336,6 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs):
 
         new_callables[func_id] = in_knl_callable
 
-    return program.copy(callables_table=new_callables)
+    return program.copy(callables_table=Map(new_callables))
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index aab295741..a3f0a5dd5 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -21,26 +21,35 @@
 """
 
 
-import numpy as np
+from dataclasses import dataclass
+from typing import FrozenSet, List, Mapping, Optional, Sequence, Type, Union
+from immutables import Map
 import islpy as isl
+from pytools.tag import Tag
+from loopy.kernel import LoopKernel
+from loopy.typing import ExpressionT, auto, not_none
+from loopy.match import ToStackMatchCovertible
 from loopy.symbolic import (get_dependencies,
         RuleAwareIdentityMapper, RuleAwareSubstitutionMapper,
         SubstitutionRuleMappingContext, CombineMapper)
 from loopy.diagnostic import LoopyError
 from pymbolic.mapper.substitutor import make_subst_func
 from loopy.translation_unit import TranslationUnit
-from loopy.kernel.instruction import MultiAssignmentBase
-from loopy.kernel.function_interface import CallableKernel, ScalarCallable
+from loopy.kernel.instruction import InstructionBase, MultiAssignmentBase
+from loopy.kernel.function_interface import (CallableKernel, InKernelCallable,
+                                             ScalarCallable)
 from loopy.kernel.tools import (kernel_has_global_barriers,
                                 find_most_recent_global_barrier)
 from loopy.kernel.data import AddressSpace
-from loopy.types import LoopyType
+from loopy.types import LoopyType, ToLoopyTypeConvertible, to_loopy_type
 
 from pymbolic import var
 from pytools import memoize_on_first_arg
 
-from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap,
-        AccessDescriptor)
+from loopy.transform.array_buffer_map import (ArrayToBufferMap,
+                                              ArrayToBufferMapBase,
+                                              NoOpArrayToBufferMap,
+                                              AccessDescriptor)
 
 
 # {{{ contains_subst_rule_invocation
@@ -105,15 +114,16 @@ def contains_a_subst_rule_invocation(kernel, insn):
 # }}}
 
 
+@dataclass(frozen=True)
 class RuleAccessDescriptor(AccessDescriptor):
-    __slots__ = ["args", "expansion_stack"]
+    args: Optional[Sequence[ExpressionT]] = None
 
 
 def access_descriptor_id(args, expansion_stack):
     return (args, expansion_stack)
 
 
-def storage_axis_exprs(storage_axis_sources, args):
+def storage_axis_exprs(storage_axis_sources, args) -> Sequence[ExpressionT]:
     result = []
 
     for saxis_source in storage_axis_sources:
@@ -140,7 +150,7 @@ def __init__(self, rule_mapping_context, kernel, subst_name, subst_tag, within):
         self.subst_tag = subst_tag
         self.within = within
 
-        self.access_descriptors = []
+        self.access_descriptors: List[RuleAccessDescriptor] = []
 
     def map_substitution(self, name, tag, arguments, expn_state):
         process_me = name == self.subst_name
@@ -347,23 +357,26 @@ def map_kernel(self, kernel):
 # }}}
 
 
-class _not_provided:  # noqa: N801
-    pass
-
-
-def precompute_for_single_kernel(kernel, callables_table, subst_use,
-        sweep_inames=None, within=None, storage_axes=None, temporary_name=None,
-        precompute_inames=None, precompute_outer_inames=None,
+def precompute_for_single_kernel(
+        kernel: LoopKernel,
+        callables_table: Mapping[str, InKernelCallable], subst_use,
+        sweep_inames=None,
+        within: ToStackMatchCovertible = None,
+        *,
+        storage_axes=None,
+        temporary_name: Optional[str] = None,
+        precompute_inames: Optional[Sequence[str]] = None,
+        precompute_outer_inames: Optional[FrozenSet[str]] = None,
         storage_axis_to_tag=None,
 
-        # "None" is a valid value here, distinct from the default.
-        default_tag=_not_provided,
+        default_tag: Union[None, Tag, str] = None,
 
-        dtype=None,
-        fetch_bounding_box=False,
-        temporary_address_space=None,
-        compute_insn_id=None,
-        **kwargs):
+        dtype: Optional[ToLoopyTypeConvertible] = None,
+        fetch_bounding_box: bool = False,
+        temporary_address_space: Union[AddressSpace, None, Type[auto]] = None,
+        compute_insn_id: Optional[str] = None,
+        _enable_mirgecom_workaround: bool = False,
+        ) -> LoopKernel:
     """Precompute the expression described in the substitution rule determined by
     *subst_use* and store it in a temporary array. A precomputation needs two
     things to operate, a list of *sweep_inames* (order irrelevant) and an
@@ -432,11 +445,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use,
         May also be specified as a comma-separated string.
 
     :arg default_tag: The :ref:`iname tag <iname-tags>` to be applied to the
-        inames created to perform the precomputation. The current default will
-        make them local axes and automatically split them to fit the work
-        group size, but this default will disappear in favor of simply leaving them
-        untagged in 2019. For 2018, a warning will be issued if no *default_tag* is
-        specified.
+        inames created to perform the precomputation. By default, new
+        inames remain untagged.
 
     :arg dtype: The dtype of the temporary variable to precompute the result
         in. Can be either a dtype as understood by :class:`numpy.dtype` or
@@ -452,23 +462,6 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use,
     Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are
     eliminated.
     """
-    if isinstance(kernel, TranslationUnit):
-        kernel_names = [i for i, clbl in
-                kernel.callables_table.items() if isinstance(clbl,
-                    CallableKernel)]
-        if len(kernel_names) != 1:
-            raise LoopyError()
-
-        return kernel.with_kernel(precompute(kernel[kernel_names[0]],
-            subst_use, sweep_inames, within, storage_axes, temporary_name,
-            precompute_inames, precompute_outer_inames, storage_axis_to_tag,
-            default_tag, dtype, fetch_bounding_box, temporary_address_space,
-            compute_insn_id, kernel.callables_table, **kwargs))
-
-    if kwargs:
-        raise TypeError("unrecognized keyword arguments: %s"
-                % ", ".join(kwargs.keys()))
-
     # {{{ check, standardize arguments
 
     if sweep_inames is None:
@@ -502,7 +495,7 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use,
 
     footprint_generators = None
 
-    subst_name = None
+    subst_name: Optional[str] = None
     subst_tag = None
 
     from pymbolic.primitives import Variable, Call
@@ -540,6 +533,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use,
     from loopy.match import parse_stack_match
     within = parse_stack_match(within)
 
+    assert subst_name is not None
+
     try:
         subst = kernel.substitutions[subst_name]
     except KeyError:
@@ -548,38 +543,11 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use,
 
     c_subst_name = subst_name.replace(".", "_")
 
-    # {{{ handle default_tag
-
-    from loopy.transform.data import _not_provided \
-            as transform_data_not_provided
-
-    if default_tag is _not_provided or default_tag is transform_data_not_provided:
-        # no need to warn for scalar precomputes
-        if sweep_inames:
-            from warnings import warn
-            warn(
-                    "Not specifying default_tag is deprecated, and default_tag "
-                    "will become mandatory in 2019.x. "
-                    "Pass 'default_tag=\"l.auto\" to match the current default, "
-                    "or Pass 'default_tag=None to leave the loops untagged, which "
-                    "is the recommended behavior.",
-                    DeprecationWarning, stacklevel=(
-
-                        # In this case, we came here through add_prefetch. Increase
-                        # the stacklevel.
-                        3 if default_tag is transform_data_not_provided
-
-                        else 2))
-
-        default_tag = "l.auto"
-
     from loopy.kernel.data import parse_tag
     default_tag = parse_tag(default_tag)
 
     # }}}
 
-    # }}}
-
     # {{{ process invocations in footprint generators, start access_descriptors
 
     if footprint_generators:
@@ -632,6 +600,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use,
     expanding_usage_arg_deps = set()
 
     for accdesc in access_descriptors:
+        assert accdesc.args is not None
+
         for arg in accdesc.args:
             expanding_usage_arg_deps.update(
                     get_dependencies(arg) & kernel.all_inames())
@@ -684,8 +654,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use,
 
     prior_storage_axis_name_dict = {}
 
-    storage_axis_names = []
-    storage_axis_sources = []  # number for arg#, or iname
+    storage_axis_names: List[str] = []
+    storage_axis_sources: List[Union[str, int]] = []  # number for arg#, or iname
 
     # {{{ check for pre-existing precompute_inames
 
@@ -726,8 +696,11 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use,
 
         storage_axis_names.append(name)
         if name not in preexisting_precompute_inames:
-            new_iname_to_tag[name] = storage_axis_to_tag.get(
-                    tag_lookup_saxis, default_tag)
+            iname_tag = storage_axis_to_tag.get(tag_lookup_saxis, None)
+            if iname_tag is None:
+                iname_tag = default_tag
+            if iname_tag is not None:
+                new_iname_to_tag[name] = iname_tag
 
         prior_storage_axis_name_dict[name] = old_name
 
@@ -770,7 +743,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use,
 
         # }}}
 
-        abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames,
+        abm: ArrayToBufferMapBase = ArrayToBufferMap(
+                kernel, domch.domain, sweep_inames,
                 access_descriptors, len(storage_axis_names))
 
         non1_storage_axis_names = []
@@ -778,7 +752,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use,
             if abm.non1_storage_axis_flags[i]:
                 non1_storage_axis_names.append(saxis)
             else:
-                del new_iname_to_tag[saxis]
+                if saxis in new_iname_to_tag:
+                    del new_iname_to_tag[saxis]
 
                 if saxis in preexisting_precompute_inames:
                     raise LoopyError("precompute axis %d (1-based) was "
@@ -911,14 +886,38 @@ def add_assumptions(d):
 
     storage_axis_subst_dict = {}
 
-    for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices):
-        if arg_name in non1_storage_axis_names:
-            arg = var(arg_name)
-        else:
+    for i, (arg_name, base_index) in enumerate(
+            zip(storage_axis_names, abm.storage_base_indices)):
+        is_length_1 = arg_name not in non1_storage_axis_names
+        if is_length_1:
             arg = 0
+        else:
+            arg = var(arg_name)
+
+        # FIXME: Hacky workaround, remove when no longer needed.
+        # Some transform code in the mirgecom transform stack
+        # first deletes inames from instructions if they're unused and then
+        # gets upset when they've disappeared. Without this 'special handling'
+        # here, this code will replace length-1 axis subscripts with '0', as it
+        # should.
+
+        if _enable_mirgecom_workaround:
+            from pymbolic.primitives import Expression
+            if is_length_1 and not isinstance(base_index, Expression):
+                # I.e. base_index is an integer.
+                from pytools import is_single_valued
+                if is_single_valued(
+                        not_none(accdesc.storage_axis_exprs)[i]
+                        for accdesc in access_descriptors):
+                    assert access_descriptors[0].storage_axis_exprs is not None
+                    storage_axis_expr = access_descriptors[0].storage_axis_exprs[i]
+                    if not (get_dependencies(storage_axis_expr) & sweep_inames_set):
+                        # I.e. no sweeping in this axis.
+                        base_index = storage_axis_expr
 
         storage_axis_subst_dict[
-                prior_storage_axis_name_dict.get(arg_name, arg_name)] = arg+bi
+                prior_storage_axis_name_dict.get(arg_name, arg_name)] = \
+                        arg+base_index
 
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
@@ -944,7 +943,7 @@ def add_assumptions(d):
             # within_inames determined below
             )
     compute_dep_id = compute_insn_id
-    added_compute_insns = [compute_insn]
+    added_compute_insns: List[InstructionBase] = [compute_insn]
 
     if temporary_address_space == AddressSpace.GLOBAL:
         barrier_insn_id = kernel.make_unique_instruction_id(
@@ -976,7 +975,7 @@ def add_assumptions(d):
 
     kernel = invr.map_kernel(kernel)
     kernel = kernel.copy(
-            instructions=added_compute_insns + kernel.instructions)
+            instructions=added_compute_insns + list(kernel.instructions))
     kernel = rule_mapping_context.finish_kernel(kernel)
 
     # }}}
@@ -1011,8 +1010,11 @@ def add_assumptions(d):
                     .with_transformed_expressions(
                         lambda expr: expr_subst_map(expr, kernel, insn))  # noqa: B023,E501
                     .copy(within_inames=frozenset(
-                        storage_axis_subst_dict.get(iname, var(iname)).name
-                        for iname in insn.within_inames)))
+                        new_iname
+                        for iname in insn.within_inames
+                        for new_iname in get_dependencies(
+                                storage_axis_subst_dict.get(iname, var(iname)))
+                        )))
 
             new_insns.append(insn)
         else:
@@ -1051,19 +1053,19 @@ def add_assumptions(d):
     # {{{ set up temp variable
 
     import loopy as lp
-    if dtype is not None:
-        dtype = np.dtype(dtype)
+
+    loopy_type = to_loopy_type(dtype, allow_none=True)
 
     if temporary_address_space is None:
         temporary_address_space = lp.auto
 
     new_temp_shape = tuple(abm.non1_storage_shape)
 
-    new_temporary_variables = kernel.temporary_variables.copy()
+    new_temporary_variables = dict(kernel.temporary_variables)
     if temporary_name not in new_temporary_variables:
         temp_var = lp.TemporaryVariable(
                 name=temporary_name,
-                dtype=dtype,
+                dtype=loopy_type,
                 base_indices=(0,)*len(new_temp_shape),
                 shape=tuple(abm.non1_storage_shape),
                 address_space=temporary_address_space,
@@ -1087,6 +1089,7 @@ def add_assumptions(d):
 
         temp_var = temp_var.copy(dtype=dtype)
 
+        assert isinstance(temp_var.shape, tuple)
         if len(temp_var.shape) != len(new_temp_shape):
             raise LoopyError("Existing and new temporary '%s' do not "
                     "have matching number of dimensions ('%d' vs. '%d') "
@@ -1157,6 +1160,6 @@ def precompute(program, *args, **kwargs):
 
         new_callables[func_id] = clbl
 
-    return program.copy(callables_table=new_callables)
+    return program.copy(callables_table=Map(new_callables))
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 2cbf21fec..3851bbdeb 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -364,6 +364,8 @@ def unprivatize_temporaries_with_inames(
 
     var_name_to_remove_indices = ir.var_name_to_remove_indices
 
+    from loopy.kernel.array import VectorArrayDimTag
+
     new_temp_vars = kernel.temporary_variables.copy()
     for tv_name, tv in new_temp_vars.items():
         remove_indices = var_name_to_remove_indices.get(tv_name, {})
@@ -374,6 +376,8 @@ def unprivatize_temporaries_with_inames(
 
         new_dim_tags = tv.dim_tags
         if new_dim_tags is not None:
+            new_dim_tags = ["vec" if isinstance(dim_tag, VectorArrayDimTag) else "c"
+                            for idim, dim_tag in enumerate(new_dim_tags)]
             new_dim_tags = tuple(dim for idim, dim in enumerate(new_dim_tags)
                 if idim not in remove_indices)
 
diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py
index b8ddabbbc..c211ab18e 100644
--- a/loopy/transform/realize_reduction.py
+++ b/loopy/transform/realize_reduction.py
@@ -36,7 +36,7 @@
 import islpy as isl
 from pymbolic.primitives import Expression
 
-from pyrsistent import PMap
+from immutables import Map
 
 from loopy.kernel.data import make_assignment
 from loopy.symbolic import ReductionCallbackMapper
@@ -90,7 +90,7 @@ class _ReductionRealizationContext:
     domains: List[isl.BasicSet]
     additional_iname_tags: Dict[str, Sequence[Tag]]
     # list only to facilitate mutation
-    boxed_callables_table: List[PMap]
+    boxed_callables_table: List[Map]
 
     # FIXME: This is a broken-by-design concept. Local-parallel scans emit a
     # reduction internally. This serves to avoid force_scan acting on that
@@ -2168,6 +2168,6 @@ def realize_reduction(t_unit, *args, **kwargs):
                 subkernel=new_knl)
         callables_table[knl.name] = in_knl_callable
 
-    return t_unit.copy(callables_table=callables_table)
+    return t_unit.copy(callables_table=Map(callables_table))
 
 # vim: foldmethod=marker
diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py
index a81732135..39fdb2275 100644
--- a/loopy/translation_unit.py
+++ b/loopy/translation_unit.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 __copyright__ = "Copyright (C) 2018 Kaushik Kulkarni"
 
 __license__ = """
@@ -21,22 +23,29 @@
 """
 
 import collections
+from collections.abc import Set as abc_Set
+from dataclasses import field, dataclass, replace
+from typing import FrozenSet, Optional, TYPE_CHECKING, Mapping, Callable, Union, Any
+from warnings import warn
 
-from pytools import ImmutableRecord
 from pymbolic.primitives import Variable
 from functools import wraps
 
 from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction,
                             SubstitutionRuleMappingContext)
 from loopy.kernel.function_interface import (
-        CallableKernel, ScalarCallable)
-from loopy.diagnostic import LoopyError
+        CallableKernel, InKernelCallable, ScalarCallable)
+from loopy.diagnostic import LoopyError, DirectCallUncachedWarning
 from loopy.library.reduction import ReductionOpFunction
 
 from loopy.kernel import LoopKernel
-from loopy.tools import update_persistent_hash
+from loopy.target import TargetBase
 from pymbolic.primitives import Call
-from pyrsistent import pmap, PMap
+from immutables import Map
+
+if TYPE_CHECKING:
+    from loopy.target.execution import ExecutorBase
+
 
 __doc__ = """
 .. currentmodule:: loopy.translation_unit
@@ -127,7 +136,11 @@ def map_call_with_kwargs(self, expr):
 
 # {{{ translation unit
 
-class TranslationUnit(ImmutableRecord):
+FunctionIdT = Union[str, ReductionOpFunction]
+
+
+@dataclass(frozen=True)
+class TranslationUnit:
     """
     Records the information about all the callables in a :mod:`loopy` program.
 
@@ -162,6 +175,7 @@ class TranslationUnit(ImmutableRecord):
         TargetBase, function_indentifier: str)`` that returns an instance
         of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*.
 
+    .. automethod:: executor
     .. automethod:: __call__
     .. automethod:: copy
     .. automethod:: __getitem__
@@ -176,47 +190,21 @@ class TranslationUnit(ImmutableRecord):
           :meth:`~TranslationUnit.copy`.
 
     """
-    def __init__(self,
-            entrypoints=frozenset(),
-            callables_table=None,
-            target=None,
-            func_id_to_in_knl_callable_mappers=None):
-
-        # {{{ sanity checks
-
-        if callables_table is None:
-            callables_table = pmap()
-        if func_id_to_in_knl_callable_mappers is None:
-            func_id_to_in_knl_callable_mappers = []
 
-        assert isinstance(callables_table, collections.abc.Mapping)
-        assert isinstance(entrypoints, frozenset)
+    callables_table: Map[FunctionIdT, CallableKernel]
+    target: TargetBase
+    entrypoints: FrozenSet[str]
 
-        if not isinstance(callables_table, PMap):
-            callables_table = pmap(callables_table)
+    def __post_init__(self):
 
-        # }}}
-
-        super().__init__(
-                entrypoints=entrypoints,
-                callables_table=pmap(callables_table),
-                target=target,
-                func_id_to_in_knl_callable_mappers=(
-                    func_id_to_in_knl_callable_mappers))
-
-        self._program_executor_cache = {}
-        self._hash_value = None
+        assert isinstance(self.entrypoints, abc_Set)
+        assert isinstance(self.callables_table, Map)
 
-    hash_fields = (
-            "entrypoints",
-            "callables_table",
-            "target",)
-
-    update_persistent_hash = update_persistent_hash
+        object.__setattr__(self, "_program_executor_cache", {})
 
     def copy(self, **kwargs):
         target = kwargs.pop("target", None)
-        program = super().copy(**kwargs)
+        program = replace(self, **kwargs)
         if target:
             from loopy.kernel import KernelState
             if max(callable_knl.subkernel.state
@@ -239,8 +227,8 @@ def copy(self, **kwargs):
                     raise NotImplementedError()
                 new_callables[func_id] = clbl
 
-            program = super().copy(
-                callables_table=new_callables, target=target)
+            program = replace(
+                    self, callables_table=Map(new_callables), target=target)
 
         return program
 
@@ -253,7 +241,7 @@ def with_entrypoints(self, entrypoints):
             entrypoints = frozenset([e.strip() for e in
                 entrypoints.split(",")])
 
-        assert isinstance(entrypoints, frozenset)
+        assert isinstance(entrypoints, abc_Set)
 
         return self.copy(entrypoints=entrypoints)
 
@@ -279,7 +267,7 @@ def with_kernel(self, kernel):
             # update the callable kernel
             new_in_knl_callable = self.callables_table[kernel.name].copy(
                     subkernel=kernel)
-            new_callables = self.callables_table.remove(kernel.name).set(
+            new_callables = self.callables_table.delete(kernel.name).set(
                     kernel.name, new_in_knl_callable)
             return self.copy(callables_table=new_callables)
         else:
@@ -310,6 +298,47 @@ def default_entrypoint(self):
                              " The default entrypoint kernel is not uniquely"
                              " determined.")
 
+    def executor(self,
+                 *args, entrypoint: Optional[str] = None, **kwargs) -> ExecutorBase:
+        """Return an object that hosts caches of compiled code for execution (i.e.
+        a subclass of :class:`ExecutorBase`), specific to an execution
+        environment (e.g. an OpenCL context) and a given entrypoint.
+
+        :arg entrypoint: The name of the entrypoint callable to be called.
+            Defaults to :attr:`default_entrypoint`.
+            A :class:`ValueError` results if no entrypoint is specified and
+            the translation unit does not have exactly one entrypoint.
+
+        The variable arguments to this are target-specific. The
+        :class:`PyOpenCLTarget` takes a :class:`~pyopencl.Context` or a
+        :class:`~pyopencl.CommandQueue`.
+        """
+        if entrypoint is None:
+            nentrypoints = len(self.entrypoints)
+            if nentrypoints == 1:
+                entrypoint, = self.entrypoints  # unpack the sole entrypoint
+            elif nentrypoints > 1:
+                raise ValueError("TranslationUnit has multiple possible entrypoints."
+                                 " The default entrypoint kernel is not uniquely"
+                                 " determined. You may explicitly specify an"
+                                 " entrypoint using the 'entrypoint' kwarg.")
+            elif nentrypoints == 0:
+                raise ValueError("TranslationUnit has no entrypoints, but"
+                                 f" {len(self.callables_table)} callables."
+                                 " Use TranslationUnit.with_entrypoints to"
+                                 " set an entrypoint.")
+            else:
+                raise AssertionError  # unreachable: len() is never negative
+        else:
+            if entrypoint not in self.entrypoints:
+                raise LoopyError(f"'{entrypoint}' not in list of possible "
+                        "entrypoints for the translation unit. "
+                        "Maybe you want to invoke 'with_entrypoints' before "
+                        "calling the translation unit?")
+
+        return self.target.get_kernel_executor(self, *args,
+                                               entrypoint=entrypoint, **kwargs)
+
     def __call__(self, *args, **kwargs):
         """
         Builds and calls the *entrypoint* kernel, if
@@ -317,8 +346,33 @@ def __call__(self, *args, **kwargs):
 
         :arg entrypoint: The name of the entrypoint callable to be called.
             Defaults to :attr:`default_entrypoint`.
+
+        .. warning::
+
+            While this was the main execution interface for loopy for many
+            years (and reasonably efficient), the caches that made this so
+            kept lots of expensive 'stuff' (such as OpenCL contexts) alive
+            for no good reason, leading to major inefficiencies.
+            See :meth:`executor` for an efficient, cached way to
+            invoke kernels.
         """
+
+        # The rationale for this is that the executor cache held long-lived
+        # references to OpenCL contexts, and translation units were kept alive
+        # long-term by caches, leading to many stale contexts being kept alive.
+        # While attempts were made to turn those into weak references, this was
+        # ultimately cumbersome and ineffective.
+        #
+        # In addition, the executor interface speeds up kernel invocation
+        # by removing one unnecessary layer of function call.
+        warn("TranslationUnit.__call__ will become uncached in 2024, "
+             "meaning it will incur possibly substantial compilation cost "
+             "with every invocation. Use TranslationUnit.executor to obtain "
+             "an object that holds longer-lived caches.",
+             DirectCallUncachedWarning, stacklevel=2)
+
         entrypoint = kwargs.get("entrypoint", None)
+
         if entrypoint is None:
             nentrypoints = len(self.entrypoints)
             if nentrypoints == 1:
@@ -346,38 +400,38 @@ def __call__(self, *args, **kwargs):
 
         key = self.target.get_kernel_executor_cache_key(*args, **kwargs)
         try:
-            pex = self._program_executor_cache[key]
+            pex = self._program_executor_cache[key]  # pylint: disable=no-member
         except KeyError:
             pex = self.target.get_kernel_executor(self, *args, **kwargs)
-            self._program_executor_cache[key] = pex
+            self._program_executor_cache[key] = pex  # pylint: disable=no-member
+
+        del kwargs["entrypoint"]
 
         return pex(*args, **kwargs)
 
     def __str__(self):
         # FIXME: do a topological sort by the call graph
 
-        def strify_callable(clbl):
-            return str(clbl.subkernel)
-
         return "\n".join(
-                strify_callable(clbl)
+                str(clbl.subkernel)
                 for name, clbl in self.callables_table.items()
                 if isinstance(clbl, CallableKernel))
 
-    def __setstate__(self, state_obj):
-        super().__setstate__(state_obj)
+    # FIXME: Delete these when _program_executor_cache leaves the building
+    def __getstate__(self):
+        from dataclasses import fields  # asdict() would deep-copy fields
+        return {fld.name: getattr(self, fld.name) for fld in fields(self)}
 
-        self._program_executor_cache = {}
+    def __setstate__(self, state_obj):
+        for k, v in state_obj.items():
+            object.__setattr__(self, k, v)
 
-    def __hash__(self):
-        # NOTE: _hash_value may vanish during pickling
-        if getattr(self, "_hash_value", None) is None:
-            from loopy.tools import LoopyKeyBuilder
-            key_hash = LoopyKeyBuilder.new_hash()
-            self.update_persistent_hash(key_hash, LoopyKeyBuilder())
-            self._hash_value = hash(key_hash.digest())
+        object.__setattr__(self, "_program_executor_cache", {})
 
-        return self._hash_value
+    # FIXME: This is here because Firedrake expects it, for some legacy reason.
+    # Without that, it would be safe to delete.
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.update_for_dataclass(key_hash, self)
 
 # }}}
 
@@ -455,7 +509,8 @@ def make_clbl_inf_ctx(callables, entrypoints):
     return CallablesInferenceContext(callables, name_gen)
 
 
-class CallablesInferenceContext(ImmutableRecord):
+@dataclass(frozen=True)
+class CallablesInferenceContext:
     """
     Helper class for housekeeping a :attr:`loopy.TranslationUnit.callables_table`
     while traversing through callables of :class:`loopy.TranslationUnit`.
@@ -480,18 +535,14 @@ class CallablesInferenceContext(ImmutableRecord):
 
     .. automethod:: __getitem__
     """
-    def __init__(self, callables,
-                 clbl_name_gen,
-                 renames=None,
-                 new_entrypoints=frozenset()):
-        if renames is None:
-            renames = collections.defaultdict(frozenset)
-        assert isinstance(callables, collections.abc.Mapping)
-
-        super().__init__(callables=dict(callables),
-                         clbl_name_gen=clbl_name_gen,
-                         renames=renames,
-                         new_entrypoints=new_entrypoints)
+    callables: Mapping[str, InKernelCallable]
+    clbl_name_gen: Callable[[str], str]
+    renames: Mapping[str, FrozenSet[str]] = field(
+            default_factory=lambda: collections.defaultdict(frozenset))
+    new_entrypoints: FrozenSet[str] = frozenset()
+
+    def copy(self, **kwargs: Any) -> CallablesInferenceContext:
+        return replace(self, **kwargs)
 
     def with_callable(self, old_function_id, new_clbl,
                       is_entrypoint=False):
@@ -515,7 +566,7 @@ def with_callable(self, old_function_id, new_clbl,
         if isinstance(old_function_id, Variable):
             old_function_id = old_function_id.name
 
-        renames = self.renames.copy()
+        renames = collections.defaultdict(frozenset, self.renames)
 
         # if the callable already exists => return the function
         # identifier corresponding to that callable.
@@ -556,7 +607,7 @@ def with_callable(self, old_function_id, new_clbl,
         # must allocate a new clbl in the namespace => find a unique id for it
         unique_function_id = self.clbl_name_gen(old_function_id)
 
-        updated_callables = self.callables.copy()
+        updated_callables = dict(self.callables)
         updated_callables[unique_function_id] = new_clbl
         renames[old_function_id] |= frozenset([unique_function_id])
 
@@ -642,7 +693,7 @@ def finish_program(self, program):
 
         # }}}
 
-        return program.copy(callables_table=new_callables)
+        return program.copy(callables_table=Map(new_callables))
 
     def __getitem__(self, name):
         result = self.callables[name]
@@ -653,18 +704,17 @@ def __getitem__(self, name):
 
 # {{{ helper functions
 
-def make_program(kernel):
+def make_program(kernel: LoopKernel) -> TranslationUnit:
     """
     Returns an instance of :class:`loopy.TranslationUnit` with *kernel* as the only
     callable kernel.
     """
 
-    program = TranslationUnit(
-            callables_table={
-                kernel.name: CallableKernel(kernel)},
-            target=kernel.target)
-
-    return program
+    return TranslationUnit(
+            callables_table=Map({
+                kernel.name: CallableKernel(kernel)}),
+            target=kernel.target,
+            entrypoints=frozenset())
 
 
 def for_each_kernel(transform):
@@ -696,7 +746,7 @@ def _collective_transform(*args, **kwargs):
 
                 new_callables[func_id] = clbl
 
-            return t_unit.copy(callables_table=new_callables)
+            return t_unit.copy(callables_table=Map(new_callables))
         else:
             assert isinstance(t_unit_or_kernel, LoopKernel)
             kernel = t_unit_or_kernel
@@ -791,7 +841,7 @@ def resolve_callables(program):
         else:
             raise NotImplementedError(f"{type(clbl)}")
 
-    program = program.copy(callables_table=callables_table)
+    program = program.copy(callables_table=Map(callables_table))
 
     validate_kernel_call_sites(program)
 
diff --git a/loopy/types.py b/loopy/types.py
index 57b9548bb..4c3b74ea6 100644
--- a/loopy/types.py
+++ b/loopy/types.py
@@ -22,10 +22,10 @@
 THE SOFTWARE.
 """
 
-from typing import Any, Mapping
-from warnings import warn
+from typing import Any, Mapping, Type, Union
 import numpy as np
 
+from loopy.typing import auto
 from loopy.diagnostic import LoopyError
 
 __doc__ = """
@@ -46,24 +46,24 @@ class LoopyType:
     Abstract class for dtypes of variables encountered in a
     :class:`loopy.LoopKernel`.
     """
-    def is_integral(self):
+    def is_integral(self) -> bool:
         raise NotImplementedError()
 
-    def is_complex(self):
+    def is_complex(self) -> bool:
         raise NotImplementedError()
 
-    def uses_complex(self):
+    def uses_complex(self) -> bool:
         raise NotImplementedError()
 
-    def is_composite(self):
+    def is_composite(self) -> bool:
         raise NotImplementedError()
 
     @property
-    def itemsize(self):
+    def itemsize(self) -> int:
         raise NotImplementedError()
 
     @property
-    def numpy_dtype(self):
+    def numpy_dtype(self) -> np.dtype:
         raise ValueError("'%s' is not a numpy type"
                 % str(self))
 
@@ -78,8 +78,8 @@ class AtomicType(LoopyType):
 # {{{ numpy-based dtype
 
 class NumpyType(LoopyType):
-    def __init__(self, dtype, target=None):
-        assert not isinstance(dtype, NumpyType)
+    def __init__(self, dtype: np.dtype):
+        assert not isinstance(dtype, LoopyType)
 
         if dtype is None:
             raise TypeError("may not pass None to construct NumpyType")
@@ -87,33 +87,27 @@ def __init__(self, dtype, target=None):
         if dtype == object:
             raise TypeError("loopy does not directly support object arrays")
 
-        if target is not None:
-            warn("Passing target is deprecated and will stop working in 2022.",
-                    DeprecationWarning, stacklevel=2)
-
         self.dtype = np.dtype(dtype)
 
-    def __hash__(self):
+    def __hash__(self) -> int:
         return hash(self.dtype)
 
     def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, self.dtype)
 
-    def __eq__(self, other):
+    def __eq__(self, other: object) -> bool:
         return (
-                type(self) == type(other)
-                and self.dtype == other.dtype)
-
-    def __ne__(self, other):
-        return not self.__eq__(other)
+                type(self) is type(other)
+                # mypy doesn't understand 'type(self) is type(other)'
+                and self.dtype == other.dtype)  # type: ignore[attr-defined]
 
-    def is_integral(self):
+    def is_integral(self) -> bool:
         return self.dtype.kind in "iu"
 
-    def is_complex(self):
+    def is_complex(self) -> bool:
         return self.dtype.kind == "c"
 
-    def involves_complex(self):
+    def involves_complex(self) -> bool:
         def dtype_involves_complex(dtype):
             if dtype.kind == "c":
                 return True
@@ -131,14 +125,14 @@ def is_composite(self):
         return self.dtype.kind == "V"
 
     @property
-    def itemsize(self):
+    def itemsize(self) -> int:
         return self.dtype.itemsize
 
     @property
-    def numpy_dtype(self):
+    def numpy_dtype(self) -> np.dtype:
         return self.dtype
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return "np:" + repr(self.dtype)
 
 # }}}
@@ -171,43 +165,42 @@ class OpaqueType(LoopyType):
     through one ValueArg and go out to another. It is introduced to accomodate
     functional calls to external libraries.
     """
-    def __init__(self, name):
+    def __init__(self, name: str) -> None:
         assert isinstance(name, str)
         self.name = name
 
-    def is_integral(self):
+    def is_integral(self) -> bool:
         return False
 
-    def is_complex(self):
+    def is_complex(self) -> bool:
         return False
 
-    def involves_complex(self):
+    def involves_complex(self) -> bool:
         return False
 
     def update_persistent_hash(self, key_hash, key_builder):
         key_builder.rec(key_hash, self.name)
 
-    def __hash__(self):
+    def __hash__(self) -> int:
         return hash(self.name)
 
-    def __eq__(self, other):
+    def __eq__(self, other: object) -> bool:
         return (
-                type(self) == type(other)
-                and self.name == other.name)
-
-    def __ne__(self, other):
-        return not self.__eq__(other)
+                type(self) is type(other)
+                # mypy doesn't understand 'type(self) is type(other)'
+                and self.name == other.name  # type: ignore[attr-defined]
+                )
 
 # }}}
 
 
-def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False,
-        target=None):
-    if target is not None:
-        warn("Passing target is deprecated and will stop working in 2022.",
-                DeprecationWarning, stacklevel=2)
+ToLoopyTypeConvertible = Union[Type[auto], None, np.dtype, LoopyType]
+
 
-    from loopy.kernel.data import auto
+def to_loopy_type(dtype: ToLoopyTypeConvertible,
+                  allow_auto: bool = False, allow_none: bool = False,
+                  for_atomic: bool = False
+                  ) -> Union[Type[auto], None, LoopyType]:
     if dtype is None:
         if allow_none:
             return None
@@ -216,7 +209,8 @@ def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False,
 
     elif dtype is auto:
         if allow_auto:
-            return dtype
+            # mypy doesn't seem to catch that this narrows the type of dtype
+            return dtype  # type: ignore[return-value]
         else:
             raise LoopyError("dtype may not be auto")
 
@@ -224,7 +218,9 @@ def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False,
 
     if dtype is not None:
         try:
-            numpy_dtype = np.dtype(dtype)
+            # We're playing fast and loose here, and mypy is onto us. It has a
+            # point.
+            numpy_dtype = np.dtype(dtype)  # type: ignore
         except Exception:
             pass
 
diff --git a/loopy/typing.py b/loopy/typing.py
index d6714d870..5a20d2e0b 100644
--- a/loopy/typing.py
+++ b/loopy/typing.py
@@ -21,7 +21,7 @@
 """
 
 
-from typing import Union, Tuple
+from typing import Union, Tuple, TypeVar, Optional
 
 import numpy as np
 
@@ -36,3 +36,18 @@
 ExpressionT = Union[IntegralT, FloatT, Expression]
 ShapeType = Tuple[ExpressionT, ...]
 StridesType = ShapeType
+
+
+class auto:  # noqa
+    """A generic placeholder object for something that should be automatically
+    determined.  See, for example, the *shape* or *strides* argument of
+    :class:`ArrayArg`.
+    """
+
+
+T = TypeVar("T")
+
+
+def not_none(obj: Optional[T]) -> T:
+    assert obj is not None  # narrows Optional[T] to T; AssertionError on None
+    return obj
diff --git a/loopy/version.py b/loopy/version.py
index 5372b5935..f66c24dee 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -42,7 +42,7 @@
 # }}}
 
 
-VERSION = (2022, 1)
+VERSION = (2024, 1)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
 
diff --git a/setup.cfg b/setup.cfg
index e3a8cc8bd..822df80d7 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -18,6 +18,7 @@ per-file-ignores =
 
 [tool:pytest]
 doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS
+addopts = --ignore=proto-tests --ignore=loopy/target/c/compyte/ndarray
 
 [mypy]
 python_version = 3.8
diff --git a/setup.py b/setup.py
index a14cee92c..d9b8f6643 100644
--- a/setup.py
+++ b/setup.py
@@ -84,7 +84,7 @@ def write_git_revision(package_name):
 
       python_requires="~=3.8",
       install_requires=[
-          "pytools>=2022.1.7",
+          "pytools>=2023.1.1",
           "pymbolic>=2022.1",
           "genpy>=2016.1.2",
 
@@ -98,6 +98,7 @@ def write_git_revision(package_name):
           "Mako",
           "pyrsistent",
           "immutables",
+          "typing_extensions",
           ],
 
       extras_require={
diff --git a/test/test_c_execution.py b/test/test_c_execution.py
index 4b5fdb1f8..11e336e0d 100644
--- a/test/test_c_execution.py
+++ b/test/test_c_execution.py
@@ -336,11 +336,6 @@ def eval_tester(knl):
         # the default (non-guessed) toolchain!
         __test(eval_tester, ExecutableCTarget, compiler=ccomp)
 
-    # and test that we will fail if we remove a required attribute
-    del ccomp.toolchain.undefines
-    with pytest.raises(AttributeError):
-        __test(eval_tester, ExecutableCTarget, compiler=ccomp)
-
     # next test that some made up compiler can be specified
     ccomp = CCompiler(cc="foo")
     assert isinstance(ccomp.toolchain, GCCToolchain)
diff --git a/test/test_einsum.py b/test/test_einsum.py
index bada5c8c9..c3ed4ec98 100644
--- a/test/test_einsum.py
+++ b/test/test_einsum.py
@@ -26,6 +26,7 @@
 import loopy as lp
 import numpy as np
 import pyopencl as cl
+import pyopencl.array
 
 from pyopencl.tools import \
     pytest_generate_tests_for_pyopencl as pytest_generate_tests  # noqa
@@ -140,6 +141,28 @@ def test_einsum_array_ops_triple_prod(ctx_factory, spec):
     assert np.linalg.norm(out - ans) <= 1e-15
 
 
+def test_einsum_with_variable_strides(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    spec = "ijk,jl->il"
+    knl = lp.make_einsum(spec, ("a", "b"),
+                         default_order=lp.auto, default_offset=lp.auto)
+
+    a_untransposed = np.random.randn(3, 5, 4)
+    b = np.random.randn(4, 5)
+
+    a = a_untransposed.transpose((0, 2, 1))
+    a_dev = cl.array.to_device(queue, a_untransposed).transpose((0, 2, 1))
+    assert a_dev.strides == a.strides
+
+    _evt, (result,) = knl(queue, a=a_dev, b=b)
+
+    ref = np.einsum(spec, a, b)
+
+    assert np.allclose(result.get(), ref)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 6a4079f00..2d1c7bc22 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1768,7 +1768,8 @@ def test_ilp_and_conditionals(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl)
 
 
-def test_unr_and_conditionals(ctx_factory):
+@pytest.mark.parametrize("unr_tag", ["unr", "unr_hint"])
+def test_unr_and_conditionals(ctx_factory, unr_tag):
     ctx = ctx_factory()
 
     knl = lp.make_kernel("{[k]: 0<=k<n}}",
@@ -1786,7 +1787,7 @@ def test_unr_and_conditionals(ctx_factory):
 
     ref_knl = knl
 
-    knl = lp.split_iname(knl, "k", 2, inner_tag="unr")
+    knl = lp.split_iname(knl, "k", 2, inner_tag=unr_tag)
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl)
 
@@ -3563,7 +3564,7 @@ def test_no_barrier_err_for_global_temps_with_base_storage(ctx_factory):
     np.testing.assert_allclose(2*np.arange(16) + 2, out)
 
 
-def test_dgemm_with_rectangular_tile_prefetch(ctx_factory):
+def test_dgemm_with_rectangular_tile_prefetch():
     # See <https://github.com/inducer/loopy/issues/724>
     t_unit = lp.make_kernel(
         "{[i,j,k]: 0<=i,j<72 and 0<=k<32}",
@@ -3609,6 +3610,64 @@ def test_dgemm_with_rectangular_tile_prefetch(ctx_factory):
     lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit)
 
 
+def test_modulo_vs_type_context(ctx_factory):
+    """Regression test: a float64 assignee must not turn 'i % 10' into 'i % 10.0'."""
+    t_unit = lp.make_kernel(
+            "{[i]: 0 <= i < 10}",
+            """
+            # previously, the float 'type context' would propagate into
+            # the remainder, leading to 'i % 10.0' being generated, which
+            # C/OpenCL did not like.
+            <float64> a = i % 10
+            """)
+
+    queue = cl.CommandQueue(ctx_factory())  # use the fixture, like other tests
+
+    t_unit(queue)
+
+
+def test_barrier_non_zero_hw_lbound():
+    t_unit = lp.make_kernel(
+        ["{[i]: 1<=i<17}",
+         "{[j]: 0<=j<16}"],
+        """
+        <> a[i] = i      {id=w_a}
+        <> b[j] = 2*a[j] {id=w_b}
+        """)
+
+    t_unit = lp.tag_inames(t_unit, {"i": "l.0", "j": "l.0"})
+
+    t_unit = lp.preprocess_kernel(t_unit)
+    knl = lp.get_one_linearized_kernel(t_unit.default_entrypoint,
+                                       t_unit.callables_table)
+
+    assert barrier_between(knl, "w_a", "w_b")
+
+
+def test_no_unnecessary_lbarrier(ctx_factory):
+    # This regression would fail on loopy.git <= 268a7f4
+    # (Issue reported by @thilinarmtb)
+
+    t_unit = lp.make_kernel(
+        "{[i_outer, i_inner]: 0 <= i_outer < n and 0 <= i_inner < 16}",
+        """
+        <> s_a[i_inner] = ai[i_outer * 16 + i_inner] {id=write_s_a}
+        ao[i_outer * 16 + i_inner] = 2.0 * s_a[i_inner] {id=write_ao, dep=write_s_a}
+        """,
+        assumptions="n>=0")
+
+    t_unit = lp.add_dtypes(t_unit, dict(ai=np.float32))
+    t_unit = lp.tag_inames(t_unit, dict(i_inner="l.0", i_outer="g.0"))
+    t_unit = lp.set_temporary_address_space(t_unit, "s_a", "local")
+    t_unit = lp.prioritize_loops(t_unit, "i_outer,i_inner")
+
+    t_unit = lp.preprocess_kernel(t_unit)
+    knl = lp.get_one_linearized_kernel(t_unit.default_entrypoint,
+                                       t_unit.callables_table)
+
+    assert not barrier_between(knl, "write_s_a", "write_ao")
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_target.py b/test/test_target.py
index 389c865b6..10a04ed5b 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -400,7 +400,7 @@ def test_nan_support(ctx_factory, target):
          lp.Assignment(parse("c"), parse("isnan(3.14)")),
          lp.Assignment(parse("d"), parse("isnan(0.0)")),
          lp.Assignment(parse("e"), NaN(np.float32)),
-         lp.Assignment(parse("f"), Variable("isnan")(NaN())),
+         lp.Assignment(parse("f"), Variable("isnan")(NaN(None))),
          lp.Assignment(parse("g"), NaN(np.complex64)),
          lp.Assignment(parse("h"), NaN(np.complex128)),
          ],
@@ -560,22 +560,6 @@ def test_input_args_are_required(ctx_factory):
         _ = knl(queue)
 
 
-def test_pyopencl_execution_accepts_device_scalars(ctx_factory):
-    import pyopencl.array as cla
-
-    ctx = ctx_factory()
-    cq = cl.CommandQueue(ctx)
-
-    knl = lp.make_kernel("{:}",
-                         """
-                         y = 2*x
-                         """)
-
-    evt, (out,) = knl(cq, x=cla.to_device(cq, np.asarray(21)))
-
-    np.testing.assert_allclose(out.get(), 42)
-
-
 def test_pyopencl_target_with_global_temps_with_base_storage(ctx_factory):
     from pyopencl.tools import ImmediateAllocator
 
@@ -640,7 +624,7 @@ def test_glibc_bessel_functions(dtype):
         second_kind_bessel[i] = bessel_yn(n, x[i])
         """, target=lp.ExecutableCWithGNULibcTarget(compiler))
 
-    if knl.target.compiler.toolchain.cc not in ["gcc", "g++"]:
+    if knl.target.compiler.toolchain.cc not in ["gcc", "g++"]:  # pylint: disable=no-member  # noqa: E501
         pytest.skip("GNU-libc not found.")
 
     knl = lp.fix_parameters(knl, n=2)
diff --git a/test/test_transform.py b/test/test_transform.py
index 1b56344e7..5ca01dea0 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -1668,6 +1668,38 @@ def test_remove_predicates_from_insn():
     assert t_unit == ref_t_unit
 
 
+def test_precompute_lets_length1_inames_live_if_requested():
+    t_unit = lp.make_kernel(
+            "{[e,i]: 0<=e<1 and 0<=i<10}",
+            """
+            v(e, i) := e + i
+            out[e, i] = v(e, i)
+            """)
+
+    t_unit = lp.precompute(t_unit, "v", "i", _enable_mirgecom_workaround=True)
+
+    from pymbolic import parse
+    assert t_unit.default_entrypoint.id_to_insn["v"].expression == parse("e + i_0")
+
+
+def test_precompute_lets_inner_length1_inames_live():
+    t_unit = lp.make_kernel(
+            "{[e,i]: 0<=e<1 and 0<=i<10}",
+            """
+            v(e, i) := e / i
+            #v(eee, i) := eee + i
+            out[e, i] = v(e, i)
+            """)
+
+    t_unit = lp.split_iname(t_unit, "e", 16)
+    t_unit = lp.precompute(t_unit, "v", "i", _enable_mirgecom_workaround=True)
+
+    from pymbolic import parse
+    assert (
+            t_unit.default_entrypoint.id_to_insn["v"].expression
+            == parse("(e_inner + e_outer*16) / i_0"))
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])