diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6c47aa63f..2b4572611 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -141,7 +141,7 @@ jobs: downstream_tests: strategy: matrix: - downstream_project: [meshmode, grudge, pytential, pytato] + downstream_project: [arraycontext, meshmode, grudge, pytential, pytato] fail-fast: false name: Tests for downstream project ${{ matrix.downstream_project }} runs-on: ubuntu-latest @@ -190,4 +190,14 @@ jobs: pytest --tb=native -rsxw --durations=10 -m 'not parallel' tests/multigrid/ + validate_cff: + name: Validate CITATION.cff + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + - run: | + pip install cffconvert + cffconvert -i CITATION.cff --validate + # vim: sw=4 diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d94a9f484..3b9817a2b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,9 +1,4 @@ -stages: - - test - - deploy - Pytest POCL: - stage: test script: - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pybind11 numpy mako" @@ -20,7 +15,6 @@ Pytest POCL: junit: test/pytest.xml Pytest Nvidia Titan V: - stage: test script: - export PYOPENCL_TEST=nvi:titan - export EXTRA_INSTALL="pybind11 numpy mako" @@ -38,7 +32,6 @@ Pytest Nvidia Titan V: junit: test/pytest.xml Pytest POCL without arg check: - stage: test script: - export PYOPENCL_TEST=portable:pthread - export EXTRA_INSTALL="pybind11 numpy mako" @@ -56,7 +49,6 @@ Pytest POCL without arg check: junit: test/pytest.xml Pytest Intel: - stage: test script: - export PYOPENCL_TEST=intel - export EXTRA_INSTALL="pybind11 numpy mako" @@ -76,7 +68,6 @@ Pytest Intel: Pytest POCL Twice With Cache: - stage: test script: | export PYOPENCL_TEST=portable:pthread export EXTRA_INSTALL="pybind11 numpy mako" @@ -109,7 +100,6 @@ Pytest POCL Twice With Cache: # - tags Pytest POCL Examples: - stage: test script: | export PYOPENCL_TEST=portable:pthread export EXTRA_INSTALL="pybind11 numpy 
mako" @@ -134,7 +124,6 @@ Pytest POCL Examples: - tags Pylint: - stage: test script: # Needed to avoid name shadowing issues when running from source directory. - PROJECT_INSTALL_FLAGS="--editable" @@ -147,7 +136,6 @@ Pylint: - tags Documentation: - stage: deploy script: | EXTRA_INSTALL="pybind11 numpy" curl -L -O https://tiker.net/ci-support-v0 @@ -160,7 +148,6 @@ Documentation: - python3 Flake8: - stage: test script: - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh - . ./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test examples @@ -182,26 +169,10 @@ Mypy: except: - tags -Benchmarks: - stage: test - script: - - CONDA_ENVIRONMENT=.test-conda-env-py3.yml - - PROJECT=loopy - - PYOPENCL_TEST=portable:pthread - - export LOOPY_NO_CACHE=1 - - export ASV_FACTOR=1.5 - - curl -L -O https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-benchmark-py-project.sh - - ". ./build-and-benchmark-py-project.sh" - tags: - - linux - - benchmark - except: - - tags - Downstream: parallel: matrix: - - DOWNSTREAM_PROJECT: [meshmode, grudge, pytential, pytato] + - DOWNSTREAM_PROJECT: [arraycontext, meshmode, grudge, pytential, pytato] tags: - large-node - "docker-runner" diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 000000000..31bef5e73 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,103 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." 
+authors: + +# major contributors + +- family-names: "Kloeckner" + given-names: "Andreas" + orcid: "https://orcid.org/0000-0003-1228-519X" +- family-names: Kulkarni + given-names: Kaushik + email: kaushikcfd@gmail.com +- family-names: Kempf + given-names: Dominic + email: dominic.r.kempf@gmail.com +- family-names: Wala + given-names: Matt + email: wala1@illinois.edu +- family-names: Curtis + given-names: Nick + email: arghdos@gmail.com +- family-names: Stevens + given-names: James + email: jdsteve2@illinois.edu +- family-names: Fernando + given-names: Isuru + email: isuruf@gmail.com + +# smaller fixes + +- family-names: Mitchell + given-names: Lawrence + email: lawrence@wence.uk +- family-names: Alvey-Blanco + given-names: Addison J. + email: aalveyblanco@gmail.com +- family-names: Fikl + given-names: Alexandru + email: alexfikl@gmail.com +- family-names: Malone + given-names: Chris + email: chris.m.malone@gmail.com +- family-names: Ward + given-names: Connor + email: c.ward20@imperial.ac.uk +- family-names: Wilcox + given-names: Lucas C. + email: lucas@swirlee.com +- family-names: Koch + given-names: Marcel + email: marcel.koch@uni-muenster.de +- family-names: Woodman + given-names: Marmaduke + email: marmaduke.woodman@univ-amu.fr +- family-names: Smith + given-names: Matthew + email: mjsmith6@illinois.edu +- family-names: Diener + given-names: Matthias + email: mdiener@illinois.edu +- family-names: Christensen + given-names: Nicholas + email: njchris2@illinois.edu +- family-names: Nytko + given-names: Nicolas + email: nnytko2@illinois.edu +- family-names: Kirby + given-names: Robert C. 
+ email: Robert_Kirby@baylor.edu +- family-names: Hegmann + given-names: Sebastian + email: shegmann@nina.iwr.uni-heidelberg.de +- family-names: Vorderwuelbecke + given-names: Sophia + email: sv2518@ic.ac.uk +- family-names: Ratnayaka + given-names: Thilina + email: thilinarmtb@gmail.com +- family-names: Gibson + given-names: Thomas + email: gibsonthomas1120@hotmail.com +- family-names: Sun + given-names: Tianjiao + email: tj-sun@tianjiaos-air.home +- family-names: Smith + given-names: Timothy A. + email: tasmith4@illinois.edu +- family-names: Warburton + given-names: Tim + email: timwar@caam.rice.edu +- family-names: Wei + given-names: Xiaoyu + email: wxy0516@gmail.com +- family-names: Weiner + given-names: Zach + email: zachjweiner@gmail.com + +title: "Loopy" +version: 2024.1 +date-released: 2024-02-16 +url: "https://github.com/inducer/loopy" +doi: 10.5281/zenodo.10672275 +license: MIT diff --git a/MANIFEST.in b/MANIFEST.in index 293d43ffc..a87cfef7d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -19,4 +19,5 @@ include configure.py include Makefile.in include README.rst include LICENSE +include CITATION.cff include requirements*.txt diff --git a/README.rst b/README.rst index dd9c8cf76..1ef7773db 100644 --- a/README.rst +++ b/README.rst @@ -10,6 +10,9 @@ Loopy: Transformation-Based Generation of High-Performance CPU/GPU Code .. image:: https://badge.fury.io/py/loopy.png :alt: Python Package Index Release Page :target: https://pypi.org/project/loopy/ +.. image:: https://zenodo.org/badge/20281732.svg + :alt: Zenodo DOI for latest release + :target: https://zenodo.org/doi/10.5281/zenodo.10672274 Loopy lets you easily generate the tedious, complicated code that is necessary to get good performance out of GPUs and multi-core CPUs. 
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/benchmarks/run_sumpy_kernels.py b/benchmarks/run_sumpy_kernels.py deleted file mode 100644 index 72c61a539..000000000 --- a/benchmarks/run_sumpy_kernels.py +++ /dev/null @@ -1,130 +0,0 @@ -import loopy as lp -import numpy as np -import pyopencl as cl -import logging -from dataclasses import dataclass -import time - -logger = logging.getLogger(__name__) - -from pyopencl.tools import ( # noqa - pytest_generate_tests_for_pyopencl as pytest_generate_tests, -) - - -def _sumpy_kernel_init(param): - name, dim, order = param.name, param.dim, param.order - # TODO: add other kernels - assert name == "m2l" - from sumpy.expansion.multipole import ( - LinearPDEConformingVolumeTaylorMultipoleExpansion, - ) - from sumpy.expansion.local import LinearPDEConformingVolumeTaylorLocalExpansion - from sumpy.kernel import LaplaceKernel - from sumpy import E2EFromCSR - - ctx = cl.create_some_context() - np.random.seed(17) - - knl = LaplaceKernel(dim) - local_expn_class = LinearPDEConformingVolumeTaylorLocalExpansion - mpole_expn_class = LinearPDEConformingVolumeTaylorMultipoleExpansion - m_expn = mpole_expn_class(knl, order=order) - l_expn = local_expn_class(knl, order=order) - - m2l = E2EFromCSR(ctx, m_expn, l_expn, name="loopy_kernel") - m2l.get_translation_loopy_insns() - m2l.ctx = None - m2l.device = None - return m2l - - -def _sumpy_kernel_make(expn, param): - assert param.name == "m2l" - loopy_knl = expn.get_optimized_kernel() - loopy_knl = lp.add_and_infer_dtypes( - loopy_knl, - dict( - tgt_ibox=np.int32, - centers=np.float64, - tgt_center=np.float64, - target_boxes=np.int32, - src_ibox=np.int32, - src_expansions=np.float64, - tgt_rscale=np.float64, - src_rscale=np.float64, - src_box_starts=np.int32, - src_box_lists=np.int32, - ), - ) - return loopy_knl - - -@dataclass(frozen=True) -class Param: - name: str - dim: int - order: int - - -def 
cached_data(params): - data = {} - np.random.seed(17) - logging.basicConfig(level=logging.INFO) - for param in params: - data[param] = {} - expn = _sumpy_kernel_init(param) - data[param]["setup"] = expn - knl = _sumpy_kernel_make(expn, param) - knl = lp.preprocess_kernel(knl) - data[param]["instantiated"] = knl - scheduled = knl.with_kernel(lp.get_one_scheduled_kernel(knl["loopy_kernel"], - knl.callables_table)) - data[param]["scheduled"] = scheduled - return data - - -class SumpyBenchmarkSuite: - - params = [ - Param("m2l", dim=3, order=6), - Param("m2l", dim=3, order=12), - ] - - param_names = ["test_name"] - - version = 1 - - def setup_cache(self): - return cached_data(self.params) - - def time_instantiate(self, data, param): - knl = _sumpy_kernel_make(data[param]["setup"], param) - lp.preprocess_kernel(knl) - - def time_schedule(self, data, param): - knl = data[param]["instantiated"] - knl.with_kernel(lp.get_one_scheduled_kernel(knl["loopy_kernel"], - knl.callables_table)) - - def time_generate_code(self, data, param): - lp.generate_code_v2(data[param]["scheduled"]) - - time_instantiate.timeout = 600.0 - time_schedule.timeout = 600.0 - time_generate_code.timeout = 600.0 - - # Use CPU time as the timer - time_instantiate.timer = time.process_time - time_schedule.timer = time.process_time - time_generate_code.timer = time.process_time - - # No warmup is needed - time_instantiate.warmup_time = 0 - time_schedule.warmup_time = 0 - time_generate_code.warmup_time = 0 - - # Run memory benchmarks as well - peakmem_instantiate = time_instantiate - peakmem_schedule = time_schedule - peakmem_generate_code = time_generate_code diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index c53c56530..1fa237b25 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -200,6 +200,8 @@ Tag Meaning ``"l.N"`` Local (intra-group) axis N ("local") ``"g.N"`` Group-number axis N ("group") ``"unr"`` Unroll +``"unr_hint"`` Unroll using compiler directives +``"unr_hint.N"`` Unroll at 
most N times using compiler directives ``"ilp"`` | ``"ilp.unr"`` Unroll using instruction-level parallelism ``"ilp.seq"`` Realize parallel iname as innermost loop ``"like.INAME"`` Can be used when tagging inames to tag like another @@ -535,8 +537,6 @@ have the lifetime of a kernel invocation. .. autoclass:: AddressSpace .. autoclass:: TemporaryVariable - :members: - :undoc-members: .. _types: diff --git a/doc/ref_other.rst b/doc/ref_other.rst index b13f39869..8ce3be0ca 100644 --- a/doc/ref_other.rst +++ b/doc/ref_other.rst @@ -16,10 +16,11 @@ Controlling caching Running Kernels --------------- -In addition to simply calling kernels using :meth:`LoopKernel.__call__`, -the following underlying functionality may be used: +Use :meth:`TranslationUnit.executor` to bind a translation unit +to execution resources, and then use :meth:`ExecutorBase.__call__` +to invoke the kernel. -.. autoclass:: CompiledKernel +.. autoclass:: ExecutorBase Automatic Testing ----------------- diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 8e65e4591..617aef6f8 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -29,6 +29,8 @@ import a few modules and set up a :class:`pyopencl.Context` and a >>> from warnings import filterwarnings, catch_warnings >>> filterwarnings('error', category=lp.LoopyWarning) + >>> from loopy.diagnostic import DirectCallUncachedWarning + >>> filterwarnings('ignore', category=DirectCallUncachedWarning) >>> ctx = cl.create_some_context(interactive=False) >>> queue = cl.CommandQueue(ctx) @@ -1057,7 +1059,6 @@ earlier: acc_k = 0.0f; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) a_fetch[lid(0)] = a[16 * gid(0) + lid(0)]; - barrier(CLK_LOCAL_MEM_FENCE) /* for a_fetch (insn_k_update depends on a_fetch_rule) */; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) { for (int k = 0; k <= 15; ++k) @@ -1466,7 +1467,7 @@ We'll also request a prefetch--but suppose we only do so across the .. 
doctest:: - >>> knl = lp.add_prefetch(knl, "a", "i_inner") + >>> knl = lp.add_prefetch(knl, "a", "i_inner", default_tag="l.auto") When we try to run our code, we get the following warning from loopy as a first sign that something is amiss: diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py index 3458a6e0e..486536cc0 100644 --- a/examples/python/hello-loopy.py +++ b/examples/python/hello-loopy.py @@ -24,7 +24,11 @@ # execute # ------- +# easy, slower: evt, (out,) = knl(queue, a=a) +# efficient, with caching: +knl_ex = knl.executor(ctx) +evt, (out,) = knl_ex(queue, a=a) # ENDEXAMPLE knl = lp.add_and_infer_dtypes(knl, {"a": np.dtype(np.float32)}) diff --git a/loopy/__init__.py b/loopy/__init__.py index 4796c1f59..e5aa4259a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -28,6 +28,7 @@ # {{{ imported user interface +from loopy.typing import auto from loopy.kernel.instruction import ( LegacyStringInstructionTag, UseStreamingStoreTag, MemoryOrdering, @@ -37,7 +38,6 @@ MultiAssignmentBase, Assignment, CallInstruction, CInstruction, NoOpInstruction, BarrierInstruction) from loopy.kernel.data import ( - auto, KernelArgument, ValueArg, ArrayArg, GlobalArg, ConstantArg, ImageArg, AddressSpace, @@ -147,7 +147,6 @@ from loopy.codegen.result import ( GeneratedProgram, CodeGenerationResult) -from loopy.compiled import CompiledKernel from loopy.options import Options from loopy.auto_test import auto_test_vs_ref from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran, @@ -162,7 +161,10 @@ from loopy.target.pyopencl import PyOpenCLTarget from loopy.target.ispc import ISPCTarget -from loopy.tools import Optional, t_unit_to_python, memoize_on_disk +from loopy.tools import (Optional, t_unit_to_python, memoize_on_disk, + clear_in_mem_caches) + +from loopy.target.execution import ExecutorBase __all__ = [ @@ -293,8 +295,6 @@ "gather_access_footprints", "gather_access_footprint_bytes", "Sync", - "CompiledKernel", - "auto_test_vs_ref", 
"Options", @@ -311,7 +311,9 @@ "PyOpenCLTarget", "ISPCTarget", "ASTBuilderBase", - "Optional", "memoize_on_disk", + "Optional", "memoize_on_disk", "clear_in_mem_caches", + + "ExecutorBase", # {{{ from this file diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 01e84b4a5..32f89992a 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -34,7 +34,7 @@ from pytools import UniqueNameGenerator from pytools.persistent_dict import WriteOncePersistentDict -from loopy.tools import LoopyKeyBuilder +from loopy.tools import LoopyKeyBuilder, caches from loopy.version import DATA_MODEL_VERSION from loopy.types import LoopyType from loopy.typing import ExpressionT @@ -315,6 +315,9 @@ def ast_builder(self): key_builder=LoopyKeyBuilder()) +caches.append(code_gen_cache) + + class InKernelCallablesCollector(CombineMapper): """ Returns an instance of :class:`frozenset` containing instances of @@ -493,7 +496,7 @@ def diverge_callee_entrypoints(program): new_callables[name] = clbl - return program.copy(callables_table=new_callables) + return program.copy(callables_table=Map(new_callables)) @dataclass(frozen=True) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 349f82ebd..29a7d6d72 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -24,6 +24,7 @@ """ import islpy as isl +from functools import partial from loopy.codegen.result import merge_codegen_results, wrap_in_if from loopy.schedule import ( @@ -72,7 +73,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): elif isinstance(sched_item, EnterLoop): from loopy.kernel.data import (UnrolledIlpTag, UnrollTag, ForceSequentialTag, LoopedIlpTag, VectorizeTag, - InameImplementationTag, + InameImplementationTag, UnrollHintTag, InOrderSequentialSequentialTag, filter_iname_tags_by_type) tags = kernel.iname_tags_of_type(sched_item.iname, InameImplementationTag) @@ -87,9 +88,14 @@ def generate_code_for_sched_index(codegen_state, sched_index): func = 
generate_unroll_loop elif filter_iname_tags_by_type(tags, VectorizeTag): func = generate_vectorize_loop + elif filter_iname_tags_by_type(tags, UnrollHintTag): + unroll_tags = filter_iname_tags_by_type(tags, UnrollHintTag) + hints = [codegen_state.ast_builder.emit_unroll_hint(tag.value) + for tag in unroll_tags] + func = partial(generate_sequential_loop_dim_code, hints=hints) elif not tags or filter_iname_tags_by_type(tags, (LoopedIlpTag, ForceSequentialTag, InOrderSequentialSequentialTag)): - func = generate_sequential_loop_dim_code + func = partial(generate_sequential_loop_dim_code, hints=[]) else: raise RuntimeError("encountered (invalid) EnterLoop " "for '%s', tagged '%s'" diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 2dfb532f2..d76ffc121 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -287,14 +287,12 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, result = [] - bounds = kernel.get_iname_bounds(iname) domain = kernel.get_inames_domain(iname) # It's ok to find a bound that's too "loose". The conditional # generators will mop up after us. - from loopy.isl_helpers import static_min_of_pw_aff - lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff, - constants_only=False) + from loopy.kernel.tools import get_hw_axis_base_for_codegen + lower_bound = get_hw_axis_base_for_codegen(kernel, iname) # These bounds are 'implemented' by the hardware. Make sure # that the downstream conditional generators realize that. 
@@ -345,7 +343,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, # {{{ sequential loop -def generate_sequential_loop_dim_code(codegen_state, sched_index): +def generate_sequential_loop_dim_code(codegen_state, sched_index, hints): kernel = codegen_state.kernel ecm = codegen_state.expression_to_code_mapper @@ -479,7 +477,7 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index): codegen_state, loop_iname, kernel.index_dtype, pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)), pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)), - inner_ast))) + inner_ast, hints))) return merge_codegen_results(codegen_state, result) diff --git a/loopy/compiled.py b/loopy/compiled.py deleted file mode 100644 index 0fa18eacb..000000000 --- a/loopy/compiled.py +++ /dev/null @@ -1,41 +0,0 @@ -__copyright__ = "Copyright (C) 2016 Andreas Kloeckner" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-""" - - -from loopy.target.pyopencl_execution import ( # noqa - PyOpenCLKernelExecutor) - - -# {{{ compatibility - -class CompiledKernel(PyOpenCLKernelExecutor): - """ - .. automethod:: __call__ - """ - def __init__(self, context, kernel, entrypoint): - from warnings import warn - warn("CompiledKernel is deprecated. Use LoopKernel.__call__ directly.", - DeprecationWarning, stacklevel=2) - - super().__init__(context, kernel, entrypoint) - -# }}} diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index c81d38c34..e7a93cd11 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -45,6 +45,10 @@ class ParameterFinderWarning(LoopyWarning): class WriteRaceConditionWarning(LoopyWarning): pass + +class DirectCallUncachedWarning(LoopyWarning): + pass + # }}} diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index b9a0aa27d..c23563c83 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -23,6 +23,7 @@ import re from sys import intern +from immutables import Map import loopy as lp import numpy as np @@ -325,7 +326,7 @@ def specialize_fortran_division(t_unit): new_callables[name] = clbl - return t_unit.copy(callables_table=new_callables) + return t_unit.copy(callables_table=Map(new_callables)) # }}} diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 72b842dbb..59716edb8 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -52,12 +52,6 @@ def pw_aff_to_aff(pw_aff): return pieces[0][1] -def dump_space(ls): - return " ".join( - "%s: %d" % (dim_type.find_value(dt), ls.dim(dt)) - for dt in range(1 + dim_type.all)) - - # {{{ make_slab def make_slab(space, iname, start, stop, iname_multiplier=1): diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index fb318bc93..d3b6ec0ea 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -60,6 +60,7 @@ from loopy.kernel.function_interface import InKernelCallable from loopy.codegen import PreambleInfo 
+ # {{{ loop kernel object class KernelState(IntEnum): # noqa diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index dd182211b..165727e6b 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -654,9 +654,6 @@ class ArrayBase(ImmutableRecord, Taggable): * a pymbolic expression * :class:`loopy.auto`, in which case an offset argument is added automatically, immediately following this argument. - :class:`loopy.CompiledKernel` is even smarter in its treatment of - this case and will compile custom versions of the kernel based on - whether the passed arrays have offsets or not. .. attribute:: dim_names @@ -844,6 +841,8 @@ def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0, n_axes=num_user_axes, use_increasing_target_axes=self.max_target_axes > 1, dim_names=dim_names) + + if dim_tags is not None: order = None # }}} @@ -921,7 +920,7 @@ def __eq__(self, other): is_tuple_of_expressions_equal as istoee, is_expression_equal as isee) return ( - type(self) == type(other) + type(self) is type(other) and self.name == other.name and self.dtype == other.dtype and istoee(self.shape, other.shape) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index 8dcdbe3c1..4134b11ce 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -27,13 +27,14 @@ from pymbolic.mapper import CSECachingMapperMixin from pymbolic.primitives import Slice, Variable, Subscript, Call +from loopy.kernel.array import FixedStrideArrayDimTag from loopy.tools import intern_frozenset_of_ids, Optional from loopy.symbolic import ( IdentityMapper, WalkMapper, SubArrayRef) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule, AddressSpace, ValueArg) + SubstitutionRule, AddressSpace, ValueArg, auto) from loopy.translation_unit import for_each_kernel from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl @@ -41,7 +42,6 @@ from pytools import ProcessLogger from sys import intern 
-import loopy.version import re @@ -992,6 +992,9 @@ def intern_if_str(s): subst_match = SUBST_RE.match(insn) if subst_match is not None: subst = parse_subst_rule(subst_match.groupdict()) + if subst.name in substitutions: + raise LoopyError("attempt to redefine substitution rule " + f"'{subst.name}'") substitutions[subst.name] = subst continue @@ -1732,8 +1735,30 @@ def apply_default_order_to_args(kernel, default_order): processed_args = [] for arg in kernel.args: - if isinstance(arg, ArrayBase) and arg.order is None: - arg = arg.copy(order=default_order) + if isinstance(arg, ArrayBase): + if default_order in ["c", "f", "C", "F"]: + if arg.dim_tags is None: + arg = arg.copy(order=default_order) + else: + # leave them the way they are + pass + elif default_order is auto: + if arg.dim_tags is None and arg.shape is not None: + assert arg.shape is not auto + arg = arg.copy( + dim_tags=tuple( + FixedStrideArrayDimTag(auto) + for i in range(len(arg.shape)))) + arg = arg.copy( + dim_tags=tuple( + FixedStrideArrayDimTag(auto) + if isinstance(dim_tag, FixedStrideArrayDimTag) + else dim_tag + for dim_tag in arg.dim_tags)) + else: + raise ValueError("unexpected value for default_order: " + f"'{default_order}'") + processed_args.append(arg) return kernel.copy(args=processed_args) @@ -2196,7 +2221,10 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): :arg preamble_generators: a list of functions of signature (seen_dtypes, seen_functions) where seen_functions is a set of (name, c_name, arg_dtypes), generating extra entries for *preambles*. - :arg default_order: "C" (default) or "F" + :arg default_order: "C" (default), "F" or :class:`loopy.auto`. + The default memory layout of arrays that are not explicitly + specified. If :class:`loopy.auto`, variables for strides are + automatically created. :arg default_offset: 0 or :class:`loopy.auto`. The default value of *offset* in :attr:`ArrayArg` for guessed arguments. Defaults to 0. 
@@ -2299,8 +2327,9 @@ def make_function(domains, instructions, kernel_data=None, **kwargs): from loopy.version import LANGUAGE_VERSION_SYMBOLS + import loopy.version as v version_to_symbol = { - getattr(loopy.version, lvs): lvs + getattr(v, lvs): lvs for lvs in LANGUAGE_VERSION_SYMBOLS} lang_version = kwargs.pop("lang_version", None) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 6f46214c7..e4267ab6d 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -39,8 +39,8 @@ from loopy.kernel.array import ArrayBase, ArrayDimImplementationTag from loopy.diagnostic import LoopyError -from loopy.typing import ExpressionT -from loopy.types import LoopyType +from loopy.typing import ExpressionT, ShapeType +from loopy.types import LoopyType, auto from loopy.kernel.instruction import ( # noqa InstructionBase, MemoryOrdering, @@ -117,13 +117,6 @@ def _names_from_dim_tags( # }}} -class auto: # noqa - """A generic placeholder object for something that should be automatically - determined. See, for example, the *shape* or *strides* argument of - :class:`ArrayArg`. 
- """ - - # {{{ iname tags def filter_iname_tags_by_type(tags, tag_type, max_num=None, min_num=None): @@ -261,6 +254,24 @@ def __str__(self): return "unr" +class UnrollHintTag(InameImplementationTag): + __slots__ = ["value"] + + def __init__(self, value=None): + ImmutableRecord.__init__(self, + value=value) + + @property + def key(self): + return (type(self).__name__, self.value) + + def __str__(self): + if self.value: + return f"unr_hint.{self.value}" + else: + return "unr_hint" + + class ForceSequentialTag(InameImplementationTag): def __str__(self): return "forceseq" @@ -271,12 +282,14 @@ def __str__(self): return "ord" -def parse_tag(tag): - from pytools.tag import Tag as TagBase +ToInameTagConvertible = Union[str, None, Tag] + + +def parse_tag(tag: ToInameTagConvertible) -> Optional[Tag]: if tag is None: return tag - if isinstance(tag, TagBase): + if isinstance(tag, Tag): return tag if not isinstance(tag, str): @@ -294,6 +307,11 @@ def parse_tag(tag): return UnrolledIlpTag() elif tag == "ilp.seq": return LoopedIlpTag() + elif tag == "unr_hint": + return UnrollHintTag() + elif tag.startswith("unr_hint."): + offset = len("unr_hint.") + return UnrollHintTag(int(tag[offset:])) elif tag.startswith("g."): return GroupInameTag(int(tag[2:])) elif tag.startswith("l."): @@ -647,8 +665,16 @@ class TemporaryVariable(ArrayBase): declaration. 
""" - min_target_axes = 0 - max_target_axes = 1 + storage_shape: Optional[ShapeType] + base_indices: Optional[Tuple[ExpressionT, ...]] + address_space: Union[AddressSpace, Type[auto]] + base_storage: Optional[str] + initializer: Optional[np.ndarray] + read_only: bool + _base_storage_access_may_be_aliasing: bool + + min_target_axes: ClassVar[int] = 0 + max_target_axes: ClassVar[int] = 1 allowed_extra_kwargs = ( "storage_shape", diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 1a22cbbb9..ed18e2ba4 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -31,6 +31,7 @@ from loopy.diagnostic import LoopyError from loopy.tools import Optional +from collections.abc import Set as abc_Set # {{{ instruction tags @@ -186,7 +187,7 @@ class InstructionBase(ImmutableRecord, Taggable): A :class:`frozenset` of subclasses of :class:`pytools.tag.Tag` used to provide metadata on this object. Legacy string tags are converted to :class:`LegacyStringInstructionTag` or, if they used to carry - a functional meaning, the tag carrying that same fucntional meaning + a functional meaning, the tag carrying that same functional meaning (e.g. :class:`UseStreamingStoreTag`). .. 
automethod:: __init__ @@ -267,7 +268,7 @@ def __init__(self, id, depends_on, depends_on_is_final, if depends_on_is_final is None: depends_on_is_final = False - if depends_on_is_final and not isinstance(depends_on, frozenset): + if depends_on_is_final and not isinstance(depends_on, abc_Set): raise LoopyError("Setting depends_on_is_final to True requires " "actually specifying depends_on") @@ -277,7 +278,7 @@ def __init__(self, id, depends_on, depends_on_is_final, if priority is None: priority = 0 - if not isinstance(tags, frozenset): + if not isinstance(tags, abc_Set): # was previously allowed to be tuple tags = frozenset(tags) @@ -292,10 +293,10 @@ def __init__(self, id, depends_on, depends_on_is_final, # assert all(is_interned(iname) for iname in within_inames) # assert all(is_interned(pred) for pred in predicates) - assert isinstance(within_inames, frozenset) - assert isinstance(depends_on, frozenset) or depends_on is None - assert isinstance(groups, frozenset) - assert isinstance(conflicts_with_groups, frozenset) + assert isinstance(within_inames, abc_Set) + assert isinstance(depends_on, abc_Set) or depends_on is None + assert isinstance(groups, abc_Set) + assert isinstance(conflicts_with_groups, abc_Set) ImmutableRecord.__init__(self, id=id, @@ -605,7 +606,7 @@ def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, self.var_name) def __eq__(self, other): - return (type(self) == type(other) + return (type(self) is type(other) and self.var_name == other.var_name) def __ne__(self, other): @@ -1062,7 +1063,8 @@ def __str__(self): result += " {%s}" % (": ".join(options)) if self.predicates: - result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates) + result += "\n" + 10*" " + "if (%s)" % " && ".join( + str(pred) for pred in self.predicates) return result def arg_id_to_arg(self): diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 99f5f3503..a86173fdc 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ 
-2115,4 +2115,19 @@ def get_outer_params(domains): # }}} +def get_hw_axis_base_for_codegen(kernel: LoopKernel, iname: str) -> isl.Aff: + """ + Returns a :class:`isl.PwAff` lower bound of the hardware-concurrent iname + *iname*, to serve as an offsetting expression when generating code for + that hardware axis. + """ + from loopy.kernel.data import HardwareConcurrentTag + from loopy.isl_helpers import static_min_of_pw_aff + + assert kernel.iname_tags_of_type(iname, HardwareConcurrentTag) + bounds = kernel.get_iname_bounds(iname) + lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff, + constants_only=False) + return lower_bound + # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 190e6bd9a..a4952d776 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -86,6 +86,9 @@ def __call__(self, dtype, operand1, operand2): def __ne__(self, other): return not self.__eq__(other) + def __repr__(self) -> str: + return type(self).__name__ + @staticmethod def parse_result_type(target, op_type): try: @@ -118,7 +121,7 @@ def __hash__(self): return hash((type(self),)) def __eq__(self, other): - return type(self) == type(other) + return type(self) is type(other) def __str__(self): result = type(self).__name__.replace("ReductionOperation", "").lower() @@ -359,7 +362,7 @@ def __hash__(self): return hash(type(self)) def __eq__(self, other): - return type(self) == type(other) and (self.inner_reduction == + return type(self) is type(other) and (self.inner_reduction == other.inner_reduction) def __call__(self, dtypes, operand1, operand2, callables_table, target): @@ -461,7 +464,7 @@ def __hash__(self): return hash(type(self)) def __eq__(self, other): - return type(self) == type(other) + return type(self) is type(other) @property def arg_count(self): diff --git a/loopy/match.py b/loopy/match.py index 624276dce..423c4ecec 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -24,8 +24,14 @@ THE SOFTWARE. 
""" +from abc import abstractmethod, ABC +from dataclasses import dataclass +from typing import FrozenSet, List, Sequence, Tuple, Union, Protocol from sys import intern +from loopy.kernel import LoopKernel +from loopy.kernel.instruction import InstructionBase + NoneType = type(None) @@ -33,6 +39,10 @@ import pytools.tag __doc__ = """ +.. autoclass:: Matchable +.. autoclass:: StackMatchComponent +.. autoclass:: StackMatch + .. autofunction:: parse_match .. autofunction:: parse_stack_match @@ -117,8 +127,18 @@ def re_from_glob(s): # {{{ match expression -class MatchExpressionBase: - def __call__(self, kernel, matchable): +class Matchable(Protocol): + """ + .. attribute:: tags + """ + @property + def tags(self) -> FrozenSet[pytools.tag.Tag]: + ... + + +class MatchExpressionBase(ABC): + @abstractmethod + def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool: raise NotImplementedError def __ne__(self, other): @@ -135,7 +155,7 @@ def __inv__(self): class All(MatchExpressionBase): - def __call__(self, kernel, matchable): + def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool: return True def __str__(self): @@ -148,15 +168,15 @@ def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, "all_match_expr") def __eq__(self, other): - return (type(self) == type(other)) + return type(self) is type(other) def __hash__(self): return hash(type(self)) +@dataclass(frozen=True, eq=True) class MultiChildMatchExpressionBase(MatchExpressionBase): - def __init__(self, children): - self.children = children + children: Sequence[MatchExpressionBase] def __str__(self): joiner = " %s " % type(self).__name__.lower() @@ -167,33 +187,22 @@ def __repr__(self): type(self).__name__, ", ".join(repr(ch) for ch in self.children)) - def update_persistent_hash(self, key_hash, key_builder): - key_builder.rec(key_hash, type(self).__name__) - key_builder.rec(key_hash, self.children) - - def __eq__(self, other): - return (type(self) == type(other) - and 
self.children == other.children) - - def __hash__(self): - return hash((type(self), self.children)) - class And(MultiChildMatchExpressionBase): - def __call__(self, kernel, matchable): + def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool: return all(ch(kernel, matchable) for ch in self.children) class Or(MultiChildMatchExpressionBase): - def __call__(self, kernel, matchable): + def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool: return any(ch(kernel, matchable) for ch in self.children) +@dataclass(frozen=True, eq=True) class Not(MatchExpressionBase): - def __init__(self, child): - self.child = child + child: MatchExpressionBase - def __call__(self, kernel, matchable): + def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool: return not self.child(kernel, matchable) def __str__(self): @@ -202,18 +211,8 @@ def __str__(self): def __repr__(self): return "{}({!r})".format(type(self).__name__, self.child) - def update_persistent_hash(self, key_hash, key_builder): - key_builder.rec(key_hash, "not_match_expr") - key_builder.rec(key_hash, self.child) - - def __eq__(self, other): - return (type(self) == type(other) - and self.child == other.child) - - def __hash__(self): - return hash((type(self), self.child)) - +@dataclass(frozen=True, eq=True) class ObjTagged(MatchExpressionBase): """Match if the object is tagged with a given :class:`~pytools.tag.Tag`. @@ -222,19 +221,14 @@ class ObjTagged(MatchExpressionBase): These instance-based tags will, in the not-too-distant future, replace the string-based tags matched by :class:`Tagged`. 
""" - def __init__(self, tag: pytools.tag.Tag): - self.tag = tag + tag: pytools.tag.Tag - def __call__(self, kernel, matchable): + def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool: return self.tag in matchable.tags - def update_persistent_hash(self, key_hash, key_builder): - key_builder.rec(key_hash, type(self).__name__) - key_builder.rec(key_hash, self.tag) - class GlobMatchExpressionBase(MatchExpressionBase): - def __init__(self, glob): + def __init__(self, glob: str) -> None: self.glob = glob import re @@ -253,7 +247,7 @@ def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, self.glob) def __eq__(self, other): - return (type(self) == type(other) + return (type(self) is type(other) and self.glob == other.glob) def __hash__(self): @@ -273,7 +267,8 @@ class Tagged(GlobMatchExpressionBase): These string-based tags will, in the not-too-distant future, be replace by instance-based tags matched by :class:`ObjTagged`. """ - def __call__(self, kernel, matchable): + + def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool: from loopy.kernel.instruction import LegacyStringInstructionTag if matchable.tags: return any( @@ -289,13 +284,17 @@ def __call__(self, kernel, matchable): class Writes(GlobMatchExpressionBase): - def __call__(self, kernel, matchable): + def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool: + if not isinstance(matchable, InstructionBase): + return False return any(self.re.match(name) for name in matchable.assignee_var_names()) class Reads(GlobMatchExpressionBase): - def __call__(self, kernel, matchable): + def __call__(self, kernel: LoopKernel, matchable: Matchable) -> bool: + if not isinstance(matchable, InstructionBase): + return False return any(self.re.match(name) for name in matchable.read_dependency_names()) @@ -306,7 +305,10 @@ def __call__(self, kernel, matchable): class Iname(GlobMatchExpressionBase): - def __call__(self, kernel, matchable): + def __call__(self, 
kernel: LoopKernel, matchable: Matchable) -> bool: + if not isinstance(matchable, InstructionBase): + return False + return any(self.re.match(name) for name in matchable.within_inames) @@ -421,39 +423,47 @@ def inner_parse(pstate, min_precedence=0): # {{{ stack match objects -class StackMatchComponent: +class StackMatchComponent(ABC): + """ + .. automethod:: __call__ + """ + + @abstractmethod + def __call__(self, kernel: LoopKernel, stack: Sequence[Matchable]) -> bool: + pass + def __ne__(self, other): return not self.__eq__(other) class StackAllMatchComponent(StackMatchComponent): - def __call__(self, kernel, stack): + def __call__(self, kernel: LoopKernel, stack: Sequence[Matchable]) -> bool: return True def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, "all_match") def __eq__(self, other): - return (type(self) == type(other)) + return type(self) is type(other) class StackBottomMatchComponent(StackMatchComponent): - def __call__(self, kernel, stack): + def __call__(self, kernel: LoopKernel, stack: Sequence[Matchable]) -> bool: return not stack def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, "bottom_match") def __eq__(self, other): - return (type(self) == type(other)) + return type(self) is type(other) +@dataclass(eq=True, frozen=True) class StackItemMatchComponent(StackMatchComponent): - def __init__(self, match_expr, inner_match): - self.match_expr = match_expr - self.inner_match = inner_match + match_expr: MatchExpressionBase + inner_match: StackMatchComponent - def __call__(self, kernel, stack): + def __call__(self, kernel: LoopKernel, stack: Sequence[Matchable]) -> bool: if not stack: return False @@ -463,22 +473,12 @@ def __call__(self, kernel, stack): return self.inner_match(kernel, stack[1:]) - def update_persistent_hash(self, key_hash, key_builder): - key_builder.rec(key_hash, "item_match") - key_builder.rec(key_hash, self.match_expr) - key_builder.rec(key_hash, self.inner_match) - - def 
__eq__(self, other): - return (type(self) == type(other) - and self.match_expr == other.match_expr - and self.inner_match == other.inner_match) - +@dataclass(eq=True, frozen=True) class StackWildcardMatchComponent(StackMatchComponent): - def __init__(self, inner_match): - self.inner_match = inner_match + inner_match: StackMatchComponent - def __call__(self, kernel, stack): + def __call__(self, kernel: LoopKernel, stack: Sequence[Matchable]) -> bool: for i in range(0, len(stack)): if self.inner_match(kernel, stack[i:]): return True @@ -490,10 +490,10 @@ def __call__(self, kernel, stack): # {{{ stack matcher +@dataclass(eq=True, frozen=True) class RuleInvocationMatchable: - def __init__(self, id, tags): - self.id = id - self.tags = tags + id: str + tags: FrozenSet[pytools.tag.Tag] def write_dependency_names(self): raise TypeError("writes: query may not be applied to rule invocations") @@ -505,27 +505,21 @@ def inames(self, kernel): raise TypeError("inames: query may not be applied to rule invocations") +@dataclass(eq=True, frozen=True) class StackMatch: - def __init__(self, root_component): - self.root_component = root_component - - def update_persistent_hash(self, key_hash, key_builder): - key_builder.rec(key_hash, self.root_component) - - def __eq__(self, other): - return ( - type(self) == type(other) - and - self.root_component == other.root_component) + """ + .. 
automethod:: __call__ + """ - def __ne__(self, other): - return not self.__eq__(other) + root_component: StackMatchComponent - def __call__(self, kernel, insn, rule_stack): + def __call__( + self, kernel: LoopKernel, insn: InstructionBase, + rule_stack: Sequence[Tuple[str, FrozenSet[pytools.tag.Tag]]]) -> bool: """ :arg rule_stack: a tuple of (name, tags) rule invocation, outermost first """ - stack_of_matchables = [insn] + stack_of_matchables: List[Matchable] = [insn] for id, tags in rule_stack: stack_of_matchables.append(RuleInvocationMatchable(id, tags)) @@ -536,7 +530,10 @@ def __call__(self, kernel, insn, rule_stack): # {{{ stack match parsing -def parse_stack_match(smatch): +ToStackMatchCovertible = Union[StackMatch, str, None] + + +def parse_stack_match(smatch: ToStackMatchCovertible) -> StackMatch: """Syntax example:: ... > outer > ... > next > innermost $ @@ -561,7 +558,7 @@ def parse_stack_match(smatch): smatch = smatch.strip() - match = StackAllMatchComponent() + match: StackMatchComponent = StackAllMatchComponent() if smatch[-1] == "$": match = StackBottomMatchComponent() smatch = smatch[:-1] diff --git a/loopy/options.py b/loopy/options.py index 4763252bc..64667463a 100644 --- a/loopy/options.py +++ b/loopy/options.py @@ -131,7 +131,7 @@ class Options(ImmutableRecord): output values. This is helpful if arguments are inferred and argument ordering is thus implementation-defined. - See :meth:`CompiledKernel.__call__`. + See :meth:`ExecutorBase.__call__`. .. 
attribute:: write_wrapper diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 6c9456661..a84ac4359 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -46,7 +46,7 @@ from loopy.transform.data import allocate_temporaries_for_base_storage from loopy.kernel.array import ArrayDimImplementationTag from loopy.kernel.data import _ArraySeparationInfo, KernelArgument -from loopy.translation_unit import for_each_kernel +from loopy.translation_unit import TranslationUnit, for_each_kernel from loopy.typing import ExpressionT from pytools import ProcessLogger @@ -724,7 +724,7 @@ def filter_reachable_callables(t_unit): t_unit.entrypoints) new_callables = {name: clbl for name, clbl in t_unit.callables_table.items() if name in (reachable_function_ids | t_unit.entrypoints)} - return t_unit.copy(callables_table=new_callables) + return t_unit.copy(callables_table=Map(new_callables)) def _preprocess_single_kernel(kernel: LoopKernel, is_entrypoint: bool) -> LoopKernel: @@ -788,33 +788,33 @@ def _preprocess_single_kernel(kernel: LoopKernel, is_entrypoint: bool) -> LoopKe @memoize_on_disk -def preprocess_program(program): +def preprocess_program(t_unit: TranslationUnit) -> TranslationUnit: from loopy.kernel import KernelState - if program.state >= KernelState.PREPROCESSED: - return program + if t_unit.state >= KernelState.PREPROCESSED: + return t_unit - if len([clbl for clbl in program.callables_table.values() if + if len([clbl for clbl in t_unit.callables_table.values() if isinstance(clbl, CallableKernel)]) == 1: - program = program.with_entrypoints(",".join(clbl.name for clbl in - program.callables_table.values() if isinstance(clbl, + t_unit = t_unit.with_entrypoints(",".join(clbl.name for clbl in + t_unit.callables_table.values() if isinstance(clbl, CallableKernel))) - if not program.entrypoints: + if not t_unit.entrypoints: raise LoopyError("Translation unit did not receive any entrypoints") from loopy.translation_unit import resolve_callables - program = 
resolve_callables(program) + t_unit = resolve_callables(t_unit) - program = filter_reachable_callables(program) + t_unit = filter_reachable_callables(t_unit) - program = infer_unknown_types(program, expect_completion=False) + t_unit = infer_unknown_types(t_unit, expect_completion=False) from loopy.transform.subst import expand_subst - program = expand_subst(program) + t_unit = expand_subst(t_unit) from loopy.kernel.creation import apply_single_writer_depencency_heuristic - program = apply_single_writer_depencency_heuristic(program) + t_unit = apply_single_writer_depencency_heuristic(t_unit) # Ordering restrictions: # @@ -826,7 +826,7 @@ def preprocess_program(program): # defaults from being applied. from loopy.transform.realize_reduction import realize_reduction - program = realize_reduction(program, unknown_types_ok=False) + t_unit = realize_reduction(t_unit, unknown_types_ok=False) # {{{ preprocess callable kernels @@ -838,11 +838,11 @@ def preprocess_program(program): # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects new_callables = {} - for func_id, in_knl_callable in program.callables_table.items(): + for func_id, in_knl_callable in t_unit.callables_table.items(): if isinstance(in_knl_callable, CallableKernel): new_subkernel = _preprocess_single_kernel( in_knl_callable.subkernel, - is_entrypoint=func_id in program.entrypoints) + is_entrypoint=func_id in t_unit.entrypoints) in_knl_callable = in_knl_callable.copy( subkernel=new_subkernel) elif isinstance(in_knl_callable, ScalarCallable): @@ -853,16 +853,16 @@ def preprocess_program(program): new_callables[func_id] = in_knl_callable - program = program.copy(callables_table=new_callables) + t_unit = t_unit.copy(callables_table=Map(new_callables)) # }}} # infer arg descrs of the callables - program = infer_arg_descr(program) + t_unit = infer_arg_descr(t_unit) # Ordering restriction: # callees with gbarrier in them must be inlined after inferrring arg_descr. 
- program = inline_kernels_with_gbarriers(program) + t_unit = inline_kernels_with_gbarriers(t_unit) # {{{ prepare for caching @@ -873,7 +873,7 @@ def preprocess_program(program): # }}} - return program + return t_unit # FIXME: Do we add a deprecation warning? diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 4aaa08080..f80aa6f37 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -28,6 +28,7 @@ from typing import (FrozenSet, Hashable, Sequence, AbstractSet, Any, Set, TypeVar, Mapping, Dict, Tuple, Iterator, Optional, TYPE_CHECKING) +from immutables import Map from pytools import ImmutableRecord import islpy as isl from loopy.diagnostic import LoopyError, ScheduleDebugInputError, warn_with_kernel @@ -36,7 +37,7 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.kernel.instruction import InstructionBase -from loopy.tools import LoopyKeyBuilder +from loopy.tools import LoopyKeyBuilder, caches from loopy.version import DATA_MODEL_VERSION if TYPE_CHECKING: @@ -2202,6 +2203,9 @@ def print_longest_dead_end(): key_builder=LoopyKeyBuilder()) +caches.append(schedule_cache) + + def _get_one_linearized_kernel_inner(kernel, callables_table): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. 
This allows it to be @@ -2275,7 +2279,7 @@ def linearize(t_unit): else: raise NotImplementedError(type(clbl)) - return t_unit.copy(callables_table=new_callables) + return t_unit.copy(callables_table=Map(new_callables)) # vim: foldmethod=marker diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index f2164b6dd..fd7d46876 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -335,20 +335,28 @@ def get_return_from_kernel_mapping(kernel): # {{{ check for write races in accesses -def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table): +def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table, + address_space): """ Returns *True* if the execution instances of *insn_a* and *insn_b*, accessing the same variable via access maps *map_a* and *map_b*, result in an access race. + :arg address_space: An instance of :class:`loopy.kernel.data.AddressSpace` + of the variable whose accesses are being checked for a race. + .. note:: The accesses ``map_a``, ``map_b`` lead to write races iff there exists 2 *unequal* global ids that access the same address. """ import pymbolic.primitives as p - from loopy.symbolic import isl_set_from_expr + from loopy.symbolic import isl_set_from_expr, aff_from_expr, aff_to_expr from loopy.kernel.data import (filter_iname_tags_by_type, - HardwareConcurrentTag) + HardwareConcurrentTag, + AddressSpace) + from loopy.kernel.tools import get_hw_axis_base_for_codegen + + assert address_space in [AddressSpace.LOCAL, AddressSpace.GLOBAL] gsize, lsize = knl.get_grid_size_upper_bounds(callables_table, return_dict=True) @@ -357,9 +365,10 @@ def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table): # Step 1.1: Project out inames which are also map's dims, but does not form the # insn's within_inames - # Step 1.2: Project out sequential inames in the access maps - # Step 1.3: Rename the dims with their iname tags i.e. 
(g.i or l.i) - # Step 1.4: Name the ith output dims as _lp_dim{i} + # Step 1.2: Perform any offsetting required to the hw axes iname terms + # Step 1.3: Project out sequential inames in the access maps + # Step 1.4: Rename the dims with their iname tags i.e. (g.i or l.i) + # Step 1.5: Name the ith output dims as _lp_dim{i} updated_maps = [] @@ -381,6 +390,36 @@ def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table): if dt == isl.dim_type.in_: tag, = filter_iname_tags_by_type(knl.inames[name].tags, HardwareConcurrentTag) + + iname_lower_bound = get_hw_axis_base_for_codegen(knl, name) + + if not iname_lower_bound.plain_is_zero(): + # Hardware inames with nonzero base have an offset applied in + # code generation: + # https://github.com/inducer/loopy/blob/4e0b1c7635afe1473c8636377f8e7ef6d78dfd46/loopy/codegen/loop.py#L293-L297 + # https://github.com/inducer/loopy/issues/600#issuecomment-1104066735 + + map_ = map_.add_dims(isl.dim_type.out, 1) + map_ = map_.move_dims( + isl.dim_type.in_, pos+1, + isl.dim_type.out, map_.dim(isl.dim_type.out)-1, + 1 + ) + map_ = map_.set_dim_name(isl.dim_type.in_, pos+1, name+"'") + + lbound_offset_expr_aff = aff_from_expr( + map_.domain().space, + (p.Variable(name+"'") + + aff_to_expr(iname_lower_bound) + - p.Variable(name)) + ) + lbound_offset_as_domain = lbound_offset_expr_aff.zero_basic_set() + map_ = map_.intersect_domain(lbound_offset_as_domain) + + map_ = map_.project_out(dt, pos, 1) + assert map_.get_dim_name(dt, pos) == name+"'" + map_ = map_.set_dim_name(dt, pos, name) + map_ = map_.set_dim_name(dt, pos, str(tag)) for i_l in lsize: @@ -438,25 +477,40 @@ def _check_for_access_races(map_a, insn_a, map_b, insn_b, knl, callables_table): # {{{ Step 5: create the set any(l.i.A != l.i.B) OR any(g.i.A != g.i.B) space = set_a.space - unequal_global_id_set = isl.Set.empty(set_a.get_space()) + unequal_local_id_set = isl.Set.empty(set_a.get_space()) + unequal_group_id_set = isl.Set.empty(set_a.get_space()) + 
equal_group_id_set = isl.BasicSet.universe(set_a.get_space()) for i_l in lsize: lid_a = p.Variable(f"l.{i_l}.A") lid_b = p.Variable(f"l.{i_l}.B") - unequal_global_id_set |= (isl_set_from_expr(space, - p.Comparison(lid_a, "!=", lid_b)) - ) + unequal_local_id_set |= (isl_set_from_expr(space, + p.Comparison(lid_a, "!=", lid_b)) + ) for i_g in gsize: gid_a = p.Variable(f"g.{i_g}.A") gid_b = p.Variable(f"g.{i_g}.B") - unequal_global_id_set |= (isl_set_from_expr(space, - p.Comparison(gid_a, "!=", gid_b)) - ) + unequal_group_id_set |= (isl_set_from_expr(space, + p.Comparison(gid_a, "!=", gid_b)) + ) + equal_group_id_set &= (isl_set_from_expr(space, + p.Comparison(gid_a, "==", gid_b)) + ) # }}} - return not (set_a & set_b & unequal_global_id_set).is_empty() + if address_space == AddressSpace.GLOBAL: + return not (set_a + & set_b + & (unequal_local_id_set + | unequal_group_id_set) + ).is_empty() + else: + return not (set_a + & set_b + & unequal_local_id_set + & equal_group_id_set).is_empty() class AccessMapDescriptor(enum.Enum): @@ -550,7 +604,10 @@ def do_accesses_result_in_races(self, insn1, insn1_dir, insn2, insn2_dir, return _check_for_access_races(insn1_amap, self.kernel.id_to_insn[insn1], insn2_amap, self.kernel.id_to_insn[insn2], - self.kernel, self.callables_table) + self.kernel, self.callables_table, + (self.kernel + .get_var_descriptor(var_name) + .address_space)) # }}} diff --git a/loopy/symbolic.py b/loopy/symbolic.py index fd6013416..99cc56571 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -24,7 +24,7 @@ """ -from typing import ClassVar, Tuple +from typing import AbstractSet, ClassVar, Mapping, Sequence, Tuple from functools import reduce, cached_property from sys import intern import re @@ -69,6 +69,7 @@ from loopy.diagnostic import LoopyError from loopy.diagnostic import (ExpressionToAffineConversionError, UnableToDetermineAccessRangeError) +from loopy.typing import ExpressionT __doc__ = """ @@ -604,7 +605,7 @@ class TypedCSE(LoopyExpressionBase, 
p.CommonSubexpression): """ def __init__(self, child, prefix=None, dtype=None): - super().__init__(child, prefix) + super().__init__(child, prefix=prefix, scope=p.cse_scope.EVALUATION) self.dtype = dtype def __getinitargs__(self): @@ -1042,11 +1043,11 @@ def _get_dependencies_and_reduction_inames(expr): return deps, reduction_inames -def get_dependencies(expr): +def get_dependencies(expr: ExpressionT) -> AbstractSet[str]: return _get_dependencies_and_reduction_inames(expr)[0] -def get_reduction_inames(expr): +def get_reduction_inames(expr: ExpressionT) -> AbstractSet[str]: return _get_dependencies_and_reduction_inames(expr)[1] @@ -1329,7 +1330,12 @@ def map_call(self, expr, expn_state, *args, **kwargs): *args, **kwargs) @staticmethod - def make_new_arg_context(rule_name, arg_names, arguments, arg_context): + def make_new_arg_context( + rule_name: str, + arg_names: Sequence[str], + arguments: Sequence[ExpressionT], + arg_context: Mapping[str, ExpressionT] + ) -> Mapping[str, ExpressionT]: if len(arg_names) != len(arguments): raise RuntimeError("Rule '%s' invoked with %d arguments (needs %d)" % (rule_name, len(arguments), len(arg_names), )) @@ -1577,7 +1583,8 @@ def map_call(self, expr): tag = None return p.CommonSubexpression( - self.rec(expr.parameters[0]), tag) + self.rec(expr.parameters[0]), tag, + scope=p.cse_scope.EVALUATION) else: raise TypeError("cse takes two arguments") @@ -1601,6 +1608,16 @@ def map_call(self, expr): else: raise TypeError("if takes three arguments") + elif name in ["minimum", "maximum"]: + if len(expr.parameters) == 2: + from pymbolic.primitives import Min, Max + return { + "minimum": Min, + "maximum": Max + }[name](tuple(self.rec(p) for p in expr.parameters)) + else: + raise TypeError(f"{name} takes two arguments") + else: # see if 'name' is an existing reduction op diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 54a0729da..be04d1008 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -23,7 +23,6 
@@ from __future__ import annotations - __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" __license__ = """ @@ -54,30 +53,42 @@ from loopy.typing import ExpressionT from loopy.codegen import CodeGenerationState from loopy.codegen.result import CodeGenerationResult + from loopy.target.execution import ExecutorBase + from loopy.translation_unit import TranslationUnit, FunctionIdT ASTType = TypeVar("ASTType") -class TargetBase(): +class TargetBase: """Base class for all targets, i.e. different combinations of code that loopy can generate. Objects of this type must be picklable. """ - # {{{ persistent hashing + # {{{ hashing/equality hash_fields: ClassVar[Tuple[str, ...]] = () comparison_fields: ClassVar[Tuple[str, ...]] = () + def __hash__(self): + # NOTE: _hash_value may vanish during pickling + if getattr(self, "_hash_value", None) is None: + from loopy.tools import LoopyKeyBuilder + key_hash = LoopyKeyBuilder.new_hash() + LoopyKeyBuilder()(self) + object.__setattr__(self, "_hash_value", hash(key_hash.digest())) + + return self._hash_value # pylint: disable=no-member + def update_persistent_hash(self, key_hash, key_builder): key_hash.update(type(self).__name__.encode()) for field_name in self.hash_fields: key_builder.rec(key_hash, getattr(self, field_name)) def __eq__(self, other): - if type(self) != type(other): + if type(self) is not type(other): return False for field_name in self.comparison_fields: @@ -152,7 +163,9 @@ def get_kernel_executor_cache_key(self, *args, **kwargs): """ raise NotImplementedError() - def get_kernel_executor(self, kernel, *args, **kwargs): + def get_kernel_executor( + self, t_unit: TranslationUnit, *args, entrypoint: FunctionIdT, + **kwargs) -> ExecutorBase: """ :returns: an immutable type to be used as the cache key for kernel executor caching. @@ -164,7 +177,7 @@ class ASTBuilderBase(Generic[ASTType]): """An interface for generating (host or device) ASTs. 
""" - def __init__(self, target): + def __init__(self, target) -> None: self.target = target # {{{ library @@ -249,7 +262,10 @@ def emit_multiple_assignment(self, codegen_state, insn): raise NotImplementedError() def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - static_lbound, static_ubound, inner): + static_lbound, static_ubound, inner, hints): + raise NotImplementedError() + + def emit_unroll_hint(self, value): raise NotImplementedError() @property diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index acccbbf38..06dc2f099 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -23,12 +23,13 @@ THE SOFTWARE. """ -from typing import cast, Tuple, Optional, Sequence +from typing import cast, Tuple, Optional, Sequence, Any import re import numpy as np # noqa -from cgen import Pointer, NestedDeclarator, Block, Generable, Declarator, Const +from cgen import (Collection, Pointer, NestedDeclarator, Block, Generable, + Declarator, Const) from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE import pymbolic.primitives as p @@ -37,6 +38,8 @@ from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.symbolic import IdentityMapper +from loopy.target.execution import ExecutorBase +from loopy.translation_unit import FunctionIdT, TranslationUnit from loopy.types import NumpyType, LoopyType from loopy.typing import ExpressionT from loopy.kernel import LoopKernel @@ -244,12 +247,9 @@ def _preamble_generator(preamble_info, func_qualifier="inline"): {res_ctype} y = 1; while (n > 1) {{ - if (n % 2) {{ + if (n % 2) y = x * y; - x = x * x; - }} - else - x = x * x; + x = x * x; n = n / 2; }} @@ -438,12 +438,6 @@ def dtype_to_typename(self, dtype): # These kind of shouldn't be here. 
return self.get_dtype_registry().dtype_to_ctype(dtype) - def get_kernel_executor_cache_key(self, *args, **kwargs): - raise NotImplementedError - - def get_kernel_executor(self, knl, *args, **kwargs): - raise NotImplementedError - # }}} @@ -784,9 +778,6 @@ def get_function_definition( from cgen import ( FunctionBody, - - # Post-mid-2016 cgens have 'Collection', too. - Module as Collection, Initializer, Line) @@ -1092,6 +1083,7 @@ def get_temporary_var_declarator(self, if temp_var.storage_shape: shape = temp_var.storage_shape else: + assert isinstance(temp_var.shape, tuple) shape = temp_var.shape assert isinstance(shape, tuple) @@ -1111,6 +1103,7 @@ def get_temporary_var_declarator(self, from cgen import AlignedAttribute temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl) + assert isinstance(temp_var.address_space, AddressSpace) return self.wrap_decl_for_address_space(temp_var_decl, temp_var.address_space) @@ -1224,7 +1217,7 @@ def emit_multiple_assignment(self, codegen_state, insn): in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - lbound, ubound, inner): + lbound, ubound, inner, hints): ecm = codegen_state.expression_to_code_mapper from pymbolic import var @@ -1232,7 +1225,7 @@ def emit_sequential_loop(self, codegen_state, iname, iname_dtype, from pymbolic.mapper.stringifier import PREC_NONE from cgen import For, InlineInitializer - return For( + loop = For( InlineInitializer( POD(self, iname_dtype, iname), ecm(lbound, PREC_NONE, "i")), @@ -1245,6 +1238,18 @@ def emit_sequential_loop(self, codegen_state, iname, iname_dtype, "++%s" % iname, inner) + if hints: + return Collection(list(hints) + [loop]) + else: + return loop + + def emit_unroll_hint(self, value): + from cgen import Pragma + if value: + return Pragma(f"unroll {value}") + else: + return Pragma("unroll") + def emit_initializer(self, codegen_state, dtype, name, val_str, is_const): decl = POD(self, dtype, name) @@ -1378,10 +1383,11 @@ def 
get_kernel_executor_cache_key(self, *args, **kwargs): # and None isn't allowed in that setting. return _CExecutorCacheKey - def get_kernel_executor(self, t_unit, *args, **kwargs): - from loopy.target.c.c_execution import CKernelExecutor - return CKernelExecutor(t_unit, entrypoint=kwargs.pop("entrypoint"), - compiler=self.compiler) + def get_kernel_executor( + self, t_unit: TranslationUnit, + *args: Any, entrypoint: FunctionIdT, **kwargs: Any) -> ExecutorBase: + from loopy.target.c.c_execution import CExecutor + return CExecutor(t_unit, entrypoint=entrypoint, compiler=self.compiler) def get_host_ast_builder(self): # enable host code generation diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 557c67f03..b1685cad1 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -43,7 +43,7 @@ from loopy.schedule.tools import KernelArgInfo from loopy.codegen.result import GeneratedProgram from loopy.translation_unit import TranslationUnit -from loopy.target.execution import (KernelExecutorBase, +from loopy.target.execution import (ExecutorBase, ExecutionWrapperGeneratorBase, get_highlighted_code) import logging @@ -56,6 +56,17 @@ def _lpy_even_div(a, b): # FIXME: This error message is kind of crummy. raise ValueError("expected even division") return result + + +def _lpy_even_div_none(a, b): + if a is None: + return None + + result, remdr = divmod(a, b) + if remdr != 0: + # FIXME: This error message is kind of crummy. 
+ raise ValueError("expected even division") + return result """ @@ -269,16 +280,17 @@ def __init__(self, toolchain=None, # default args self.toolchain = GCCToolchain( cc="gcc", + ld="ld", cflags="-std=c99 -O3 -fPIC".split(), ldflags="-shared".split(), libraries=[], library_dirs=[], defines=[], undefines=[], - source_suffix="c", so_ext=".so", o_ext=".o", - include_dirs=[]) + include_dirs=[], + features=set()) if toolchain is None: # copy in all differing values @@ -443,9 +455,9 @@ class _KernelInfo: invoker: Callable[..., Any] -# {{{ CKernelExecutor +# {{{ CExecutor -class CKernelExecutor(KernelExecutorBase): +class CExecutor(ExecutorBase): """An object connecting a kernel to a :class:`CompiledKernel` for execution. @@ -472,14 +484,9 @@ def get_wrapper_generator(self): return CExecutionWrapperGenerator() @memoize_method - def translation_unit_info( - self, entrypoint: str, + def translation_unit_info(self, arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> _KernelInfo: - # FIXME: Remove entrypoint argument - assert entrypoint == self.entrypoint - - t_unit = self.get_typed_and_scheduled_translation_unit( - entrypoint, arg_to_dtype) + t_unit = self.get_typed_and_scheduled_translation_unit(arg_to_dtype) from loopy.codegen import generate_code_v2 codegen_result = generate_code_v2(t_unit) @@ -488,18 +495,18 @@ def translation_unit_info( host_code = codegen_result.host_code() all_code = "\n".join([dev_code, "", host_code]) - if t_unit[entrypoint].options.write_code: + if t_unit[self.entrypoint].options.write_code: output = all_code - if t_unit[entrypoint].options.allow_terminal_colors: + if t_unit[self.entrypoint].options.allow_terminal_colors: output = get_highlighted_code(output) - if t_unit[entrypoint].options.write_code is True: + if t_unit[self.entrypoint].options.write_code is True: print(output) else: - with open(t_unit[entrypoint].options.write_code, "w") as outf: + with open(t_unit[self.entrypoint].options.write_code, "w") as outf: outf.write(output) - if 
t_unit[entrypoint].options.edit_code: + if t_unit[self.entrypoint].options.edit_code: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor @@ -508,18 +515,18 @@ def translation_unit_info( c_kernels = [] from loopy.schedule.tools import get_kernel_arg_info - kai = get_kernel_arg_info(t_unit[entrypoint]) + kai = get_kernel_arg_info(t_unit[self.entrypoint]) for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel( - t_unit[entrypoint], dp, kai.passed_names, all_code, + t_unit[self.entrypoint], dp, kai.passed_names, all_code, self.compiler)) return _KernelInfo( t_unit=t_unit, c_kernels=c_kernels, - invoker=self.get_invoker(t_unit, entrypoint, codegen_result)) + invoker=self.get_invoker(t_unit, self.entrypoint, codegen_result)) - def __call__(self, *args, entrypoint=None, **kwargs): + def __call__(self, *args, **kwargs): """ :returns: ``(None, output)`` the output is a tuple of output arguments (arguments that are written as part of the kernel). The order is given @@ -529,16 +536,13 @@ def __call__(self, *args, entrypoint=None, **kwargs): :class:`dict` instead, with keys of argument names and values of the returned arrays. 
""" - assert entrypoint is not None - if __debug__: self.check_for_required_array_arguments(kwargs.keys()) if self.packing_controller is not None: kwargs = self.packing_controller(kwargs) - program_info = self.translation_unit_info(entrypoint, - self.arg_to_dtype(kwargs)) + program_info = self.translation_unit_info(self.arg_to_dtype(kwargs)) return program_info.invoker( program_info.c_kernels, *args, **kwargs) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 2f7ef35f5..f0c1fabd5 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -41,6 +41,7 @@ from loopy.tools import is_integer from loopy.types import LoopyType from loopy.target.c import CExpression +from loopy.typing import ExpressionT __doc__ = """ @@ -84,7 +85,7 @@ def with_assignments(self, names_to_vars): type_inf_mapper = self.type_inf_mapper.with_assignments(names_to_vars) return type(self)(self.codegen_state, self.fortran_abi, type_inf_mapper) - def infer_type(self, expr): + def infer_type(self, expr: ExpressionT) -> LoopyType: result = self.type_inf_mapper(expr) assert isinstance(result, LoopyType) @@ -331,7 +332,7 @@ def map_linear_subscript(self, expr, type_context): def make_subscript(self, array, base_expr, subscript): return base_expr[subscript] - def map_integer_div_operator(self, base_func_name, op_func, expr, type_context): + def _map_integer_div_operator(self, base_func_name, op_func, expr, type_context): from loopy.symbolic import get_dependencies iname_deps = get_dependencies(expr) & self.kernel.all_inames() domain = self.kernel.get_inames_domain(iname_deps) @@ -342,6 +343,11 @@ def map_integer_div_operator(self, base_func_name, op_func, expr, type_context): num_type = self.infer_type(expr.numerator) den_type = self.infer_type(expr.denominator) + + if not num_type.is_integral() or not den_type.is_integral(): + raise NotImplementedError("remainder and floordiv " + "for floating-point types") + from 
loopy.isl_helpers import is_nonnegative num_nonneg = is_nonnegative(expr.numerator, domain) \ or num_type.numpy_dtype.kind == "u" @@ -362,10 +368,10 @@ def seen_func(name): if den_nonneg: if num_nonneg: return op_func( - self.rec(expr.numerator, type_context), - self.rec(expr.denominator, type_context)) + self.rec(expr.numerator, "i"), + self.rec(expr.denominator, "i")) else: - seen_func("%s_pos_b" % base_func_name) + seen_func(f"{base_func_name}_pos_b") return var(f"{base_func_name}_pos_b_{suffix}")( self.rec(expr.numerator, "i"), self.rec(expr.denominator, "i")) @@ -377,7 +383,7 @@ def seen_func(name): def map_floor_div(self, expr, type_context): import operator - return self.map_integer_div_operator( + return self._map_integer_div_operator( "loopy_floor_div", operator.floordiv, expr, type_context) def map_remainder(self, expr, type_context): @@ -386,7 +392,7 @@ def map_remainder(self, expr, type_context): raise RuntimeError("complex remainder not defined") import operator - return self.map_integer_div_operator( + return self._map_integer_div_operator( "loopy_mod", operator.mod, expr, type_context) def map_if(self, expr, type_context): @@ -431,10 +437,8 @@ def map_constant(self, expr, type_context): " The generated code will be equivalent with the added benefit" " of sound pickling/unpickling of kernel objects.") from pymbolic.primitives import NaN - if not isinstance(expr, np.generic): - return self.map_nan(NaN(), type_context) - else: - return self.map_nan(NaN(expr.dtype.type), type_context) + data_type = expr.dtype.type if isinstance(expr, np.generic) else None + return self.map_nan(NaN(data_type), type_context) elif np.isneginf(expr): return -p.Variable("INFINITY") elif np.isinf(expr): @@ -788,13 +792,10 @@ def _map_division_operator(self, operator, expr, enclosing_prec): force_parens_around=self.multiplicative_primitives) return self.parenthesize_if_needed( - "{} {} {}".format( - # Space is necessary--otherwise '/*' - # (i.e. 
divide-dererference) becomes - # start-of-comment in C. - num_s, - operator, - denom_s), + f"{num_s} {operator} {denom_s}", + # Space is necessary--otherwise '/*' + # (i.e. divide-dererference) becomes + # start-of-comment in C. enclosing_prec, PREC_PRODUCT) def map_quotient(self, expr, enclosing_prec): diff --git a/loopy/target/c/compyte b/loopy/target/c/compyte index 165b3abae..d4549d4c7 160000 --- a/loopy/target/c/compyte +++ b/loopy/target/c/compyte @@ -1 +1 @@ -Subproject commit 165b3abae63bc39124a342ce1a539adbf6cd8a09 +Subproject commit d4549d4c711513e2cc098d3f5d4e918eac53ee7a diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index f20b8c15a..1c5e601d4 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -525,9 +525,9 @@ def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, cast_str = "(%s *) " % (ctype) return Block([ - POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + POD(self, NumpyType(lhs_dtype.dtype), old_val_var), - POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + POD(self, NumpyType(lhs_dtype.dtype), new_val_var), DoWhile( "atomicCAS(" diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 2584fc4ab..1e49de938 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -38,7 +38,7 @@ logger = logging.getLogger(__name__) from pytools.persistent_dict import WriteOncePersistentDict -from loopy.tools import LoopyKeyBuilder +from loopy.tools import LoopyKeyBuilder, caches from loopy.typing import ExpressionT from loopy.types import LoopyType, NumpyType from loopy.kernel import KernelState, LoopKernel @@ -107,11 +107,11 @@ class _ArgFindingEquation: lhs: ExpressionT rhs: ExpressionT - # Arg finding code is sorted by priority, lowest order first + # Arg finding code is sorted by priority, all equations (across all unknowns) + # of lowest priority first. 
order: int based_on_names: FrozenSet[str] - require_names: bool class ExecutionWrapperGeneratorBase(ABC): @@ -164,8 +164,6 @@ def generate_integer_arg_finding_from_array_data( equations: List[_ArgFindingEquation] = [] - from pymbolic.primitives import If - for arg_name in kai.passed_arg_names: arg = kernel.arg_dict[arg_name] assert arg.dtype is not None @@ -179,10 +177,10 @@ def generate_integer_arg_finding_from_array_data( lhs=var(arg.name).attr("shape").index(axis_nr), rhs=shape_i, order=0, - based_on_names=frozenset({arg.name}), - require_names=True)) + based_on_names=frozenset({arg.name}))) - for axis_nr, stride_i in enumerate(get_strides(arg)): + strides = get_strides(arg) + for axis_nr, stride_i in enumerate(strides): if stride_i is not None: equations.append( _ArgFindingEquation( @@ -192,43 +190,68 @@ def generate_integer_arg_finding_from_array_data( rhs=_str_to_expr(stride_i), order=0, based_on_names=frozenset({arg.name}), - require_names=True)) - - if arg.offset is not None: - if not kernel.options.no_numpy: - offset = var("getattr")(var(arg.name), var('"offset"'), 0) - else: - offset = var(arg.name).attr("offset") + )) - offset = If(var(f"{arg.name} is None"), 0, offset) + if not arg.is_input and isinstance(arg.shape, tuple): + # If no value was found by other means, provide + # C-contiguous default strides for output-only + # arguments. + equations.append( + _ArgFindingEquation( + lhs=(strides[axis_nr + 1] + * arg.shape[axis_nr + 1]) + if axis_nr + 1 < len(strides) + else 1, + rhs=_str_to_expr(stride_i), + # Find strides from last dim to first, + # starting at order=1 so that shape + # parameters (found above) are + # available. 
+ order=len(strides) - axis_nr, + based_on_names=frozenset(), + )) + if arg.offset is not None: equations.append( _ArgFindingEquation( - lhs=var("_lpy_even_div")( - offset, arg.dtype.itemsize), + lhs=var("_lpy_even_div_none")( + var("getattr")( + var(arg.name), var('"offset"'), var("None")), + arg.dtype.itemsize), rhs=_str_to_expr(arg.offset), + order=0, + based_on_names=frozenset([arg.name]), + )) - # Argument finding from offsets should run last, - # as it assumes a zero offset if a variable is - # not passed. That should only be done if no - # other approach yielded a value for the variable. + # If no value was found by other means, default to zero. + equations.append( + _ArgFindingEquation( + lhs=0, + rhs=_str_to_expr(arg.offset), order=1, - based_on_names=frozenset(arg.name), - require_names=False, + based_on_names=frozenset(), )) # }}} # {{{ regroup equations by unknown - unknown_to_equations: Dict[str, List[_ArgFindingEquation]] = {} + order_to_unknown_to_equations: \ + Dict[int, Dict[str, List[_ArgFindingEquation]]] = {} for eqn in equations: deps = dep_map(eqn.rhs) if len(deps) == 1: unknown_var, = deps - unknown_to_equations.setdefault(unknown_var.name, []).append((eqn)) + order_to_unknown_to_equations \ + .setdefault(eqn.order, {}) \ + .setdefault(unknown_var.name, []) \ + .append((eqn)) + else: + # Zero deps: nothing to determine, forget about it. 
+ # 2+ deps: not implemented + pass del equations @@ -243,72 +266,67 @@ def generate_integer_arg_finding_from_array_data( gen("# {{{ find integer arguments from array data") gen("") - for unknown_name in sorted(unknown_to_equations): - unk_equations = sorted(unknown_to_equations[unknown_name], - key=lambda eqn: eqn.order) - req_subgen = CodeGenerator() - not_req_subgen = CodeGenerator() + for order_value in sorted(order_to_unknown_to_equations): + for unknown_name in sorted(order_to_unknown_to_equations[order_value]): + unk_equations = sorted( + order_to_unknown_to_equations[order_value][unknown_name], + key=lambda eqn: eqn.order) + subgen = CodeGenerator() - seen_based_on_names: Set[FrozenSet[str]] = set() + seen_based_on_names: Set[FrozenSet[str]] = set() - if_or_elif = "if" + if_or_elif = "if" - for eqn in unk_equations: - try: - # overkill :) - value_expr = solve_affine_equations_for( - [unknown_name], - [(eqn.lhs, eqn.rhs)] - )[Variable(unknown_name)] - except Exception as e: - # went wrong? oh well - from warnings import warn - warn("Unable to generate code to automatically " - f"find '{unknown_name}' " - f"from '{', '.join(eqn.based_on_names)}':\n" - f"{e}", ParameterFinderWarning) - continue - - # Do not use more than one bit of data from each of the - # 'based_on_names' to find each value, i.e. if a value can be - # found via shape and strides, only one of them suffices. - # This also helps because strides can be unreliable in the - # face of zero-length axes. - if eqn.based_on_names in seen_based_on_names: - continue - seen_based_on_names.add(eqn.based_on_names) - - if eqn.require_names: - condition = " and ".join( - f"{ary_name} is not None" - for ary_name in eqn.based_on_names) - req_subgen(f"{if_or_elif} {condition}:") - with Indentation(req_subgen): - req_subgen( + for eqn in unk_equations: + if eqn.rhs == Variable(unknown_name): + # Some of the expressions above are non-affine. 
Let's not + # get carried away by trying to solve a much more complex + # problem than needed. + value_expr = eqn.lhs + else: + try: + # overkill :) + value_expr = solve_affine_equations_for( + [unknown_name], + [(eqn.lhs, eqn.rhs)] + )[Variable(unknown_name)] + except Exception as e: + # went wrong? oh well + from warnings import warn + warn("Unable to generate code to automatically " + f"find '{unknown_name}' " + f"from '{', '.join(eqn.based_on_names)}':\n" + f"{e}", ParameterFinderWarning) + continue + + # Do not use more than one bit of data from each of the + # 'based_on_names' to find each value, i.e. if a value can be + # found via shape and strides, only one of them suffices. + # This also helps because strides can be unreliable in the + # face of zero-length axes. + if eqn.based_on_names in seen_based_on_names: + continue + seen_based_on_names.add(eqn.based_on_names) + + if eqn.based_on_names: + condition = " and ".join( + f"{ary_name} is not None" + for ary_name in eqn.based_on_names) + else: + condition = "True" + + subgen(f"{if_or_elif} {condition}:") + with Indentation(subgen): + subgen( f"{unknown_name} = {StringifyMapper()(value_expr)}") if_or_elif = "elif" - req_subgen("") - else: - not_req_subgen( - f"{unknown_name} = {StringifyMapper()(value_expr)}") - - not_req_subgen("") + subgen("") - if not_req_subgen.code: - gen(f"if {unknown_name} is None:") - with Indentation(gen): - gen.extend(not_req_subgen) - - if req_subgen.code: - # still? 
try the req_subgen - gen(f"if {unknown_name} is None:") - with Indentation(gen): - gen.extend(req_subgen) - elif req_subgen.code: - gen(f"if {unknown_name} is None:") - with Indentation(gen): - gen.extend(req_subgen) + if subgen.code: + gen(f"if {unknown_name} is None:") + with Indentation(gen): + gen.extend(subgen) gen("# }}}") gen("") @@ -708,18 +726,24 @@ def __call__(self, program, entrypoint, codegen_result): key_builder=LoopyKeyBuilder()) +caches.append(typed_and_scheduled_cache) + + invoker_cache = WriteOncePersistentDict( "loopy-invoker-cache-v10-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) +caches.append(invoker_cache) + + # {{{ kernel executor -class KernelExecutorBase: - """An object connecting a kernel to a :class:`pyopencl.Context` - for execution. +class ExecutorBase: + """An object allowing the execution of an entrypoint of a + :class:`~loopy.TranslationUnit`. Create these objects using + :meth:`loopy.TranslationUnit.executor`. - .. automethod:: __init__ .. automethod:: __call__ """ packing_controller: Optional[SeparateArrayPackingController] @@ -753,14 +777,14 @@ def __init__(self, t_unit: TranslationUnit, entrypoint: str): self.packing_controller = SeparateArrayPackingController(self.sep_info) else: self.packing_controller = None - return None def check_for_required_array_arguments(self, input_args): # Formerly, the first exception raised when a required argument is not # passed was often at type inference. This exists to raise a more meaningful # message in such scenarios. Since type inference precedes compilation, this # check cannot be deferred to the generated invoker code. - # See discussion at github.com/inducer/loopy/pull/160#issuecomment-867761204 + # See discussion at + # https://github.com/inducer/loopy/pull/160#issuecomment-867761204 # and links therin for context. 
if not self.input_array_names <= set(input_args): missing_args = self.input_array_names - set(input_args) @@ -772,12 +796,12 @@ def check_for_required_array_arguments(self, input_args): "your argument.") def get_typed_and_scheduled_translation_unit_uncached( - self, entrypoint, arg_to_dtype: Optional[Map[str, LoopyType]] + self, arg_to_dtype: Optional[Map[str, LoopyType]] ) -> TranslationUnit: t_unit = self.t_unit if arg_to_dtype: - entry_knl = t_unit[entrypoint] + entry_knl = t_unit[self.entrypoint] # FIXME: This is not so nice. This transfers types from the # subarrays of sep-tagged arrays to the 'main' array, because @@ -809,7 +833,7 @@ def get_typed_and_scheduled_translation_unit_uncached( return t_unit def get_typed_and_scheduled_translation_unit( - self, entrypoint: str, arg_to_dtype: Optional[Map[str, LoopyType]] + self, arg_to_dtype: Optional[Map[str, LoopyType]] ) -> TranslationUnit: from loopy import CACHING_ENABLED @@ -824,8 +848,7 @@ def get_typed_and_scheduled_translation_unit( logger.debug("%s: typed-and-scheduled cache miss" % self.t_unit.entrypoints) - kernel = self.get_typed_and_scheduled_translation_unit_uncached(entrypoint, - arg_to_dtype) + kernel = self.get_typed_and_scheduled_translation_unit_uncached(arg_to_dtype) if CACHING_ENABLED: typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) @@ -861,8 +884,7 @@ def get_highlighted_code(self, entrypoint, arg_to_dtype=None, code=None): def get_code( self, entrypoint: str, arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> str: - kernel = self.get_typed_and_scheduled_translation_unit( - entrypoint, arg_to_dtype) + kernel = self.get_typed_and_scheduled_translation_unit(arg_to_dtype) from loopy.codegen import generate_code_v2 code = generate_code_v2(kernel) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index e5ce78c58..217f7a795 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -31,7 +31,7 @@ from pymbolic import var from pymbolic.mapper.stringifier import 
PREC_NONE from pytools import memoize_method -from cgen import Generable, Declarator, Const +from cgen import Generable, Declarator, Const, Collection from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper @@ -476,7 +476,7 @@ def emit_assignment(self, codegen_state, insn): return Assign(ecm(lhs, prec=PREC_NONE, type_context=None), rhs_code) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - lbound, ubound, inner): + lbound, ubound, inner, hints): ecm = codegen_state.expression_to_code_mapper from loopy.target.c import POD @@ -486,7 +486,7 @@ def emit_sequential_loop(self, codegen_state, iname, iname_dtype, from cgen.ispc import ISPCUniform - return For( + loop = For( InlineInitializer( ISPCUniform(POD(self, iname_dtype, iname)), ecm(lbound, PREC_NONE, "i")), @@ -496,6 +496,11 @@ def emit_sequential_loop(self, codegen_state, iname, iname_dtype, "++%s" % iname, inner) + if hints: + return Collection(list(hints) + [loop]) + else: + return loop + # }}} diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index c807a5360..247c00f02 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -782,6 +782,14 @@ def emit_atomic_init(self, codegen_state, lhs_atomicity, lhs_var, return self.emit_atomic_update(codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context) + def emit_unroll_hint(self, value): + # See https://man.opencl.org/attributes-loopUnroll.html + from cgen import Line + if value: + return Line(f"__attribute__((opencl_unroll_hint({value})))") + else: + return Line("__attribute__((opencl_unroll_hint))") + def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): from pymbolic.mapper.stringifier import PREC_NONE diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 92f4bbd96..40963a85e 100644 --- a/loopy/target/pyopencl.py +++ 
b/loopy/target/pyopencl.py @@ -1,3 +1,4 @@ +from __future__ import annotations """OpenCL target integrated with PyOpenCL.""" __copyright__ = "Copyright (C) 2015 Andreas Kloeckner" @@ -23,7 +24,7 @@ """ from warnings import warn -from typing import Sequence, Tuple, List, Union, Optional, cast +from typing import Sequence, Tuple, List, Union, Optional, cast, Any, TYPE_CHECKING import numpy as np import pymbolic.primitives as p @@ -34,8 +35,10 @@ from loopy.target.opencl import (OpenCLTarget, OpenCLCASTBuilder, ExpressionToOpenCLCExpressionMapper) +from loopy.target.pyopencl_execution import PyOpenCLExecutor from loopy.target.python import PythonASTBuilderBase from loopy.kernel import LoopKernel +from loopy.translation_unit import FunctionIdT, TranslationUnit from loopy.types import NumpyType from loopy.typing import ExpressionT from loopy.diagnostic import LoopyError, LoopyTypeError @@ -49,6 +52,9 @@ import logging logger = logging.getLogger(__name__) +if TYPE_CHECKING: + import pyopencl as cl + # {{{ pyopencl function scopers @@ -595,55 +601,22 @@ def alignment_requirement(self, type_decl): # }}} def get_kernel_executor_cache_key(self, queue, **kwargs): - import weakref - # Use weakref for CL context to avoid keeping context artifically alive - return (weakref.ref(queue.context), kwargs["entrypoint"]) - - def preprocess_translation_unit_for_passed_args(self, t_unit, epoint, - passed_args_dict): - - # {{{ ValueArgs -> GlobalArgs if passed as array shapes - - from loopy.kernel.data import ValueArg, GlobalArg - import pyopencl.array as cla - - knl = t_unit[epoint] - new_args = [] - - for arg in knl.args: - if isinstance(arg, ValueArg): - if (arg.name in passed_args_dict - and isinstance(passed_args_dict[arg.name], cla.Array) - and passed_args_dict[arg.name].shape == ()): - arg = GlobalArg(name=arg.name, dtype=arg.dtype, shape=(), - is_output=False, is_input=True) - - new_args.append(arg) - - knl = knl.copy(args=new_args) - - t_unit = t_unit.with_kernel(knl) - - # }}} - 
- return t_unit - - def get_kernel_executor(self, program, queue, **kwargs): - from loopy.target.pyopencl_execution import PyOpenCLKernelExecutor - - epoint = kwargs.pop("entrypoint") - program = self.preprocess_translation_unit_for_passed_args(program, - epoint, - kwargs) - - return PyOpenCLKernelExecutor(queue.context, program, - entrypoint=epoint) + return (queue.context, kwargs["entrypoint"]) + + # type-ignore because we're making things from *args: Any more concrete, + # and mypy doesn't like it. + def get_kernel_executor(self, t_unit: TranslationUnit, # type: ignore[override] + queue_or_context: Union[cl.CommandQueue, cl.Context], + *args: Any, entrypoint: FunctionIdT, **kwargs: Any + ) -> PyOpenCLExecutor: + from pyopencl import CommandQueue + if isinstance(queue_or_context, CommandQueue): + context = queue_or_context.context + else: + context = queue_or_context - def with_device(self, device): - from warnings import warn - warn("PyOpenCLTarget.with_device is deprecated, it will " - "stop working in 2022.", DeprecationWarning, stacklevel=2) - return self + from loopy.target.pyopencl_execution import PyOpenCLExecutor + return PyOpenCLExecutor(context, t_unit, entrypoint=entrypoint) # }}} diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index dd253a223..b65bdc66e 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -34,10 +34,9 @@ from loopy.typing import ExpressionT from loopy.kernel import LoopKernel from loopy.kernel.data import ArrayArg -from loopy.translation_unit import TranslationUnit from loopy.schedule.tools import KernelArgInfo from loopy.target.execution import ( - KernelExecutorBase, ExecutionWrapperGeneratorBase) + ExecutorBase, ExecutionWrapperGeneratorBase) import logging logger = logging.getLogger(__name__) @@ -270,7 +269,6 @@ def get_arg_pass(self, arg): @dataclass(frozen=True) class _KernelInfo: - t_unit: TranslationUnit cl_kernels: "_Kernels" invoker: Callable[..., 
Any] @@ -281,7 +279,7 @@ class _Kernels: # {{{ kernel executor -class PyOpenCLKernelExecutor(KernelExecutorBase): +class PyOpenCLExecutor(ExecutorBase): """An object connecting a kernel to a :class:`pyopencl.Context` for execution. @@ -303,10 +301,9 @@ def get_wrapper_generator(self): @memoize_method def translation_unit_info( - self, entrypoint: str, + self, arg_to_dtype: Optional[Map[str, LoopyType]] = None) -> _KernelInfo: - t_unit = self.get_typed_and_scheduled_translation_unit( - entrypoint, arg_to_dtype) + t_unit = self.get_typed_and_scheduled_translation_unit(arg_to_dtype) # FIXME: now just need to add the types to the arguments from loopy.codegen import generate_code_v2 @@ -315,19 +312,21 @@ def translation_unit_info( dev_code = codegen_result.device_code() - if t_unit[entrypoint].options.write_code: + if t_unit[self.entrypoint].options.write_code: #FIXME: redirect to "translation unit" level option as well. output = dev_code - if self.t_unit[entrypoint].options.allow_terminal_colors: + if self.t_unit[self.entrypoint].options.allow_terminal_colors: output = get_highlighted_code(output) - if self.t_unit[entrypoint].options.write_code is True: + if self.t_unit[self.entrypoint].options.write_code is True: print(output) else: - with open(self.t_unit[entrypoint].options.write_code, "w") as outf: + with open( + self.t_unit[self.entrypoint].options.write_code, "w" + ) as outf: outf.write(output) - if t_unit[entrypoint].options.edit_code: + if t_unit[self.entrypoint].options.edit_code: #FIXME: redirect to "translation unit" level option as well. from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") @@ -337,19 +336,18 @@ def translation_unit_info( #FIXME: redirect to "translation unit" level option as well. 
cl_program = ( cl.Program(self.context, dev_code) - .build(options=t_unit[entrypoint].options.build_options)) + .build(options=t_unit[self.entrypoint].options.build_options)) cl_kernels = _Kernels() for dp in cl_program.kernel_names.split(";"): setattr(cl_kernels, dp, getattr(cl_program, dp)) return _KernelInfo( - t_unit=t_unit, cl_kernels=cl_kernels, - invoker=self.get_invoker(t_unit, entrypoint, codegen_result)) + invoker=self.get_invoker(t_unit, self.entrypoint, codegen_result)) def __call__(self, queue, *, - allocator=None, wait_for=None, out_host=None, entrypoint=None, + allocator=None, wait_for=None, out_host=None, **kwargs): """ :arg allocator: a callable passed a byte count and returning @@ -377,19 +375,13 @@ def __call__(self, queue, *, of the returned arrays. """ - assert entrypoint is not None - - # FIXME: Remove entrypoint argument - assert entrypoint == self.entrypoint - if __debug__: self.check_for_required_array_arguments(kwargs.keys()) if self.packing_controller is not None: kwargs = self.packing_controller(kwargs) - translation_unit_info = self.translation_unit_info(entrypoint, - self.arg_to_dtype(kwargs)) + translation_unit_info = self.translation_unit_info(self.arg_to_dtype(kwargs)) return translation_unit_info.invoker( translation_unit_info.cl_kernels, queue, allocator, wait_for, diff --git a/loopy/target/python.py b/loopy/target/python.py index f9cc06147..f93d2b44e 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -226,12 +226,15 @@ def ast_block_scope_class(self): return Collection def emit_sequential_loop(self, codegen_state, iname, iname_dtype, - lbound, ubound, inner): + lbound, ubound, inner, hints): ecm = codegen_state.expression_to_code_mapper from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM from genpy import For + if hints: + raise ValueError("hints for python loops not supported") + return For( (iname,), "range(%s, %s + 1)" diff --git a/loopy/tools.py b/loopy/tools.py index 2b64e7325..ca4256b0d 100644 --- 
a/loopy/tools.py +++ b/loopy/tools.py @@ -20,6 +20,7 @@ THE SOFTWARE. """ +from typing import List import collections.abc as abc from functools import cached_property @@ -27,7 +28,8 @@ import islpy as isl import numpy as np from pytools import memoize_method, ProcessLogger -from pytools.persistent_dict import KeyBuilder as KeyBuilderBase +from pytools.persistent_dict import ( + KeyBuilder as KeyBuilderBase, WriteOncePersistentDict) from loopy.symbolic import (UncachedWalkMapper as LoopyWalkMapper, RuleAwareIdentityMapper) from pymbolic.mapper.persistent_hash import ( @@ -101,11 +103,6 @@ def update_for_dict(self, key_hash, key): update_for_defaultdict = update_for_dict - def update_for_frozenset(self, key_hash, key): - for set_key in sorted(key, - key=lambda obj: type(obj).__name__ + str(obj)): - self.rec(key_hash, set_key) - def update_for_BasicSet(self, key_hash, key): # noqa from islpy import Printer prn = Printer.to_str(key.get_ctx()) @@ -134,7 +131,7 @@ def __init__(self, expression): self.expression = expression def __eq__(self, other): - return (type(self) == type(other) + return (type(self) is type(other) and self.expression == other.expression) def __ne__(self, other): @@ -862,7 +859,7 @@ def t_unit_to_python(t_unit, var_name="t_unit", .callables_table)) for name, clbl in t_unit.callables_table.items() if isinstance(clbl, CallableKernel)} - t_unit = t_unit.copy(callables_table=new_callables) + t_unit = t_unit.copy(callables_table=Map(new_callables)) knl_python_code_srcs = [_kernel_to_python(clbl.subkernel, name in t_unit.entrypoints, @@ -892,6 +889,18 @@ def t_unit_to_python(t_unit, var_name="t_unit", # }}} +# {{{ cache management + +caches: List[WriteOncePersistentDict] = [] + + +def clear_in_mem_caches() -> None: + for cache in caches: + cache.clear_in_mem_cache() + +# }}} + + # {{{ memoize_on_disk def memoize_on_disk(func, key_builder_t=LoopyKeyBuilder): @@ -909,6 +918,8 @@ def memoize_on_disk(func, key_builder_t=LoopyKeyBuilder): 
f"-v0-{DATA_MODEL_VERSION}"), key_builder=key_builder_t()) + caches.append(transform_cache) + @wraps(func) def wrapper(*args, **kwargs): from loopy import CACHING_ENABLED diff --git a/loopy/transform/array_buffer_map.py b/loopy/transform/array_buffer_map.py index 3c4092b74..f04719c77 100644 --- a/loopy/transform/array_buffer_map.py +++ b/loopy/transform/array_buffer_map.py @@ -21,16 +21,23 @@ """ +from dataclasses import dataclass, replace +from abc import ABC, abstractmethod +from typing import Optional, Callable, Sequence, Tuple, Any +from typing_extensions import Self import islpy as isl from islpy import dim_type from loopy.symbolic import (get_dependencies, SubstitutionMapper) from pymbolic.mapper.substitutor import make_subst_func -from pytools import ImmutableRecord, memoize_method +from pytools import memoize_method from pymbolic import var +from loopy.typing import ExpressionT -class AccessDescriptor(ImmutableRecord): + +@dataclass(frozen=True) +class AccessDescriptor: """ .. attribute:: identifier @@ -38,10 +45,11 @@ class AccessDescriptor(ImmutableRecord): to the access that generated it. Any Python value. 
""" - __slots__ = [ - "identifier", - "storage_axis_exprs", - ] + identifier: Any = None + storage_axis_exprs: Optional[Sequence[ExpressionT]] = None + + def copy(self, **kwargs) -> Self: + return replace(self, **kwargs) def to_parameters_or_project_out(param_inames, set_inames, set): @@ -62,9 +70,12 @@ def to_parameters_or_project_out(param_inames, set_inames, set): # {{{ construct storage->sweep map -def build_per_access_storage_to_domain_map(storage_axis_exprs, domain, - storage_axis_names, - prime_sweep_inames): +def build_per_access_storage_to_domain_map( + storage_axis_exprs: Sequence[ExpressionT], + domain: isl.BasicSet, + storage_axis_names: Sequence[str], + prime_sweep_inames: Callable[[ExpressionT], ExpressionT] + ) -> isl.BasicMap: map_space = domain.space stor_dim = len(storage_axis_names) @@ -124,10 +135,8 @@ def move_to_par_from_out(s2smap, except_inames): return s2smap -def build_global_storage_to_sweep_map(kernel, access_descriptors, - domain_dup_sweep, dup_sweep_index, - storage_axis_names, - sweep_inames, primed_sweep_inames, prime_sweep_inames): +def build_global_storage_to_sweep_map(access_descriptors, + domain_dup_sweep, storage_axis_names, prime_sweep_inames): # The storage map goes from storage axes to the domain. # The first len(arg_names) storage dimensions are the rule's arguments. @@ -192,7 +201,23 @@ def compute_bounds(kernel, domain, stor2sweep, # {{{ array-to-buffer map -class ArrayToBufferMap: +class ArrayToBufferMapBase(ABC): + non1_storage_axis_names: Tuple[str, ...] + storage_base_indices: Tuple[ExpressionT, ...] + non1_storage_shape: Tuple[ExpressionT, ...] + non1_storage_axis_flags: Tuple[ExpressionT, ...] + + @abstractmethod + def is_access_descriptor_in_footprint(self, accdesc: AccessDescriptor) -> bool: + ... + + @abstractmethod + def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names, + boxify_sweep=False): + ... 
+ + +class ArrayToBufferMap(ArrayToBufferMapBase): def __init__(self, kernel, domain, sweep_inames, access_descriptors, storage_axis_count): self.kernel = kernel @@ -221,10 +246,10 @@ def __init__(self, kernel, domain, sweep_inames, access_descriptors, # # }}} self.stor2sweep = build_global_storage_to_sweep_map( - kernel, access_descriptors, - domain_dup_sweep, dup_sweep_index, + access_descriptors, + domain_dup_sweep, storage_axis_names, - sweep_inames, self.primed_sweep_inames, self.prime_sweep_inames) + self.prime_sweep_inames) storage_base_indices, storage_shape = compute_bounds( kernel, domain, self.stor2sweep, self.primed_sweep_inames, @@ -298,7 +323,7 @@ def __init__(self, kernel, domain, sweep_inames, access_descriptors, self.non1_storage_axis_flags = non1_storage_axis_flags self.aug_domain = aug_domain self.storage_base_indices = storage_base_indices - self.non1_storage_shape = non1_storage_shape + self.non1_storage_shape = tuple(non1_storage_shape) def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names, boxify_sweep=False): @@ -336,7 +361,8 @@ def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names, else: return convexify(domain) - def is_access_descriptor_in_footprint(self, accdesc): + def is_access_descriptor_in_footprint(self, accdesc: AccessDescriptor) -> bool: + assert accdesc.storage_axis_exprs is not None return self._is_access_descriptor_in_footprint_inner( tuple(accdesc.storage_axis_exprs)) @@ -399,17 +425,20 @@ def _is_access_descriptor_in_footprint_inner(self, storage_axis_exprs): aligned_g_s2s_parm_dom) -class NoOpArrayToBufferMap: +class NoOpArrayToBufferMap(ArrayToBufferMapBase): non1_storage_axis_names = () storage_base_indices = () non1_storage_shape = () - def is_access_descriptor_in_footprint(self, accdesc): + def is_access_descriptor_in_footprint(self, accdesc: AccessDescriptor) -> bool: # no index dependencies--every reference to the subst rule # is necessarily in the footprint. 
return True + def augment_domain_with_sweep(self, domain, new_non1_storage_axis_names, + boxify_sweep=False): + return domain # }}} # vim: foldmethod=marker diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index b3fc69671..b77c6a5ed 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -20,6 +20,7 @@ THE SOFTWARE. """ +from immutables import Map from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, AccessDescriptor) from loopy.symbolic import (get_dependencies, @@ -524,7 +525,7 @@ def buffer_array(program, *args, **kwargs): new_callables[func_id] = clbl - return program.copy(callables_table=new_callables) + return program.copy(callables_table=Map(new_callables)) # vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py index 3b239dfc7..33196ca67 100644 --- a/loopy/transform/callable.py +++ b/loopy/transform/callable.py @@ -21,6 +21,7 @@ """ import islpy as isl +from immutables import Map from pytools import UniqueNameGenerator @@ -105,12 +106,12 @@ def merge(translation_units): callables_table = {} for trans_unit in translation_units: - callables_table.update(trans_unit.callables_table.copy()) + callables_table.update(trans_unit.callables_table) return TranslationUnit( entrypoints=frozenset().union(*( t.entrypoints or frozenset() for t in translation_units)), - callables_table=callables_table, + callables_table=Map(callables_table), target=translation_units[0].target) @@ -576,7 +577,7 @@ def rename_callable(program, old_name, new_name=None, existing_ok=False): new_entrypoints = ((new_entrypoints | frozenset([new_name])) - frozenset([old_name])) - return program.copy(callables_table=new_callables_table, + return program.copy(callables_table=Map(new_callables_table), entrypoints=new_entrypoints) # }}} diff --git a/loopy/transform/data.py b/loopy/transform/data.py index e2c21e0dc..1b97087b8 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ 
-24,16 +24,16 @@ from dataclasses import dataclass, replace -from typing import Optional, Tuple, Dict +from typing import Optional, Tuple, Dict, cast import numpy as np - +from immutables import Map from islpy import dim_type from pytools import MovedFunctionDeprecationWrapper from loopy.diagnostic import LoopyError -from loopy.kernel.data import ImageArg, auto, TemporaryVariable +from loopy.kernel.data import AddressSpace, ImageArg, auto, TemporaryVariable from loopy.types import LoopyType from loopy.typing import ExpressionT @@ -146,15 +146,10 @@ def _process_footprint_subscripts(kernel, rule_name, sweep_inames, # }}} -class _not_provided: # noqa: N801 - pass - - def add_prefetch_for_single_kernel(kernel, callables_table, var_name, sweep_inames=None, dim_arg_names=None, - # "None" is a valid value here, distinct from the default. - default_tag=_not_provided, + default_tag=None, rule_name=None, temporary_name=None, @@ -414,7 +409,7 @@ def add_prefetch(program, *args, **kwargs): new_callables[func_id] = in_knl_callable - return program.copy(callables_table=new_callables) + return program.copy(callables_table=Map(new_callables)) # }}} @@ -1002,7 +997,8 @@ def allocate_temporaries_for_base_storage(kernel: LoopKernel, vng = kernel.get_var_name_generator() - name_aspace_dtype_to_bsi: Dict[Tuple[str, int, LoopyType], _BaseStorageInfo] = {} + name_aspace_dtype_to_bsi: Dict[ + Tuple[str, AddressSpace, LoopyType], _BaseStorageInfo] = {} for tv in sorted( kernel.temporary_variables.values(), @@ -1037,7 +1033,8 @@ def allocate_temporaries_for_base_storage(kernel: LoopKernel, # FIXME: Could use approximate values of ValueArgs approx_array_nbytes = 0 - bs_key = (tv.base_storage, tv.address_space, tv.dtype) + bs_key = (tv.base_storage, + cast(AddressSpace, tv.address_space), tv.dtype) bsi = name_aspace_dtype_to_bsi.get(bs_key) if bsi is None or ( diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index 2fd39cfc2..fe0bddcf3 100644 --- a/loopy/transform/fusion.py 
+++ b/loopy/transform/fusion.py @@ -449,7 +449,7 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): new_callables[result.name] = CallableKernel(result) - return TranslationUnit(callables_table=new_callables, + return TranslationUnit(callables_table=Map(new_callables), target=result.target, entrypoints=frozenset([result.name])) diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 673920364..51f970253 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -35,6 +35,7 @@ from loopy.kernel import LoopKernel from loopy.kernel.function_interface import CallableKernel +from typing import Optional __doc__ = """ .. currentmodule:: loopy @@ -2368,7 +2369,7 @@ def add_inames_for_unused_hw_axes(kernel, within=None): @for_each_kernel @remove_any_newly_unused_inames def rename_inames(kernel, old_inames, new_iname, existing_ok=False, - within=None, raise_on_domain_mismatch: bool = __debug__): + within=None, raise_on_domain_mismatch: Optional[bool] = None): r""" :arg old_inames: A collection of inames that must be renamed to **new_iname**. :arg within: a stack match as understood by @@ -2396,6 +2397,9 @@ def rename_inames(kernel, old_inames, new_iname, existing_ok=False, raise LoopyError("old_inames contains nested inames" " -- renaming is illegal.") + if raise_on_domain_mismatch is None: + raise_on_domain_mismatch = __debug__ + # sort to have deterministic implementation. old_inames = sorted(old_inames) @@ -2504,18 +2508,23 @@ def does_insn_involve_iname(kernel, insn, *args): @for_each_kernel def rename_iname(kernel, old_iname, new_iname, existing_ok=False, - within=None, preserve_tags=True): - """ + within=None, preserve_tags=True, + raise_on_domain_mismatch: Optional[bool] = None): + r""" Single iname version of :func:`loopy.rename_inames`. - :arg existing_ok: execute even if *new_iname* already exists + :arg existing_ok: execute even if *new_iname* already exists. 
:arg within: a stack match understood by :func:`loopy.match.parse_stack_match`. - :arg preserve_tags: copy the tags on the old iname to the new iname + :arg preserve_tags: copy the tags on the old iname to the new iname. + :arg raise_on_domain_mismatch: If *True*, raises an error if + :math:`\exists (i_1,i_2) \in \{\text{old\_inames}\}^2 | + \mathcal{D}_{i_1} \neq \mathcal{D}_{i_2}`. """ from itertools import product from loopy import tag_inames tags = kernel.inames[old_iname].tags - kernel = rename_inames(kernel, [old_iname], new_iname, existing_ok, within) + kernel = rename_inames(kernel, [old_iname], new_iname, existing_ok, + within, raise_on_domain_mismatch) if preserve_tags: kernel = tag_inames(kernel, product([new_iname], tags)) return kernel diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py index daf9316cd..6a39986a3 100644 --- a/loopy/transform/pack_and_unpack_args.py +++ b/loopy/transform/pack_and_unpack_args.py @@ -20,6 +20,7 @@ THE SOFTWARE. 
""" +from immutables import Map from loopy.diagnostic import LoopyError from loopy.kernel.instruction import CallInstruction from loopy.translation_unit import TranslationUnit @@ -335,6 +336,6 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs): new_callables[func_id] = in_knl_callable - return program.copy(callables_table=new_callables) + return program.copy(callables_table=Map(new_callables)) # vim: foldmethod=marker diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index aab295741..a3f0a5dd5 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -21,26 +21,35 @@ """ -import numpy as np +from dataclasses import dataclass +from typing import FrozenSet, List, Mapping, Optional, Sequence, Type, Union +from immutables import Map import islpy as isl +from pytools.tag import Tag +from loopy.kernel import LoopKernel +from loopy.typing import ExpressionT, auto, not_none +from loopy.match import ToStackMatchCovertible from loopy.symbolic import (get_dependencies, RuleAwareIdentityMapper, RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext, CombineMapper) from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func from loopy.translation_unit import TranslationUnit -from loopy.kernel.instruction import MultiAssignmentBase -from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.kernel.instruction import InstructionBase, MultiAssignmentBase +from loopy.kernel.function_interface import (CallableKernel, InKernelCallable, + ScalarCallable) from loopy.kernel.tools import (kernel_has_global_barriers, find_most_recent_global_barrier) from loopy.kernel.data import AddressSpace -from loopy.types import LoopyType +from loopy.types import LoopyType, ToLoopyTypeConvertible, to_loopy_type from pymbolic import var from pytools import memoize_on_first_arg -from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap, - 
AccessDescriptor) +from loopy.transform.array_buffer_map import (ArrayToBufferMap, + ArrayToBufferMapBase, + NoOpArrayToBufferMap, + AccessDescriptor) # {{{ contains_subst_rule_invocation @@ -105,15 +114,16 @@ def contains_a_subst_rule_invocation(kernel, insn): # }}} +@dataclass(frozen=True) class RuleAccessDescriptor(AccessDescriptor): - __slots__ = ["args", "expansion_stack"] + args: Optional[Sequence[ExpressionT]] = None def access_descriptor_id(args, expansion_stack): return (args, expansion_stack) -def storage_axis_exprs(storage_axis_sources, args): +def storage_axis_exprs(storage_axis_sources, args) -> Sequence[ExpressionT]: result = [] for saxis_source in storage_axis_sources: @@ -140,7 +150,7 @@ def __init__(self, rule_mapping_context, kernel, subst_name, subst_tag, within): self.subst_tag = subst_tag self.within = within - self.access_descriptors = [] + self.access_descriptors: List[RuleAccessDescriptor] = [] def map_substitution(self, name, tag, arguments, expn_state): process_me = name == self.subst_name @@ -347,23 +357,26 @@ def map_kernel(self, kernel): # }}} -class _not_provided: # noqa: N801 - pass - - -def precompute_for_single_kernel(kernel, callables_table, subst_use, - sweep_inames=None, within=None, storage_axes=None, temporary_name=None, - precompute_inames=None, precompute_outer_inames=None, +def precompute_for_single_kernel( + kernel: LoopKernel, + callables_table: Mapping[str, InKernelCallable], subst_use, + sweep_inames=None, + within: ToStackMatchCovertible = None, + *, + storage_axes=None, + temporary_name: Optional[str] = None, + precompute_inames: Optional[Sequence[str]] = None, + precompute_outer_inames: Optional[FrozenSet[str]] = None, storage_axis_to_tag=None, - # "None" is a valid value here, distinct from the default. 
- default_tag=_not_provided, + default_tag: Union[None, Tag, str] = None, - dtype=None, - fetch_bounding_box=False, - temporary_address_space=None, - compute_insn_id=None, - **kwargs): + dtype: Optional[ToLoopyTypeConvertible] = None, + fetch_bounding_box: bool = False, + temporary_address_space: Union[AddressSpace, None, Type[auto]] = None, + compute_insn_id: Optional[str] = None, + _enable_mirgecom_workaround: bool = False, + ) -> LoopKernel: """Precompute the expression described in the substitution rule determined by *subst_use* and store it in a temporary array. A precomputation needs two things to operate, a list of *sweep_inames* (order irrelevant) and an @@ -432,11 +445,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, May also be specified as a comma-separated string. :arg default_tag: The :ref:`iname tag ` to be applied to the - inames created to perform the precomputation. The current default will - make them local axes and automatically split them to fit the work - group size, but this default will disappear in favor of simply leaving them - untagged in 2019. For 2018, a warning will be issued if no *default_tag* is - specified. + inames created to perform the precomputation. By default, new + inames remain untagged. :arg dtype: The dtype of the temporary variable to precompute the result in. Can be either a dtype as understood by :class:`numpy.dtype` or @@ -452,23 +462,6 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are eliminated. 
""" - if isinstance(kernel, TranslationUnit): - kernel_names = [i for i, clbl in - kernel.callables_table.items() if isinstance(clbl, - CallableKernel)] - if len(kernel_names) != 1: - raise LoopyError() - - return kernel.with_kernel(precompute(kernel[kernel_names[0]], - subst_use, sweep_inames, within, storage_axes, temporary_name, - precompute_inames, precompute_outer_inames, storage_axis_to_tag, - default_tag, dtype, fetch_bounding_box, temporary_address_space, - compute_insn_id, kernel.callables_table, **kwargs)) - - if kwargs: - raise TypeError("unrecognized keyword arguments: %s" - % ", ".join(kwargs.keys())) - # {{{ check, standardize arguments if sweep_inames is None: @@ -502,7 +495,7 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, footprint_generators = None - subst_name = None + subst_name: Optional[str] = None subst_tag = None from pymbolic.primitives import Variable, Call @@ -540,6 +533,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, from loopy.match import parse_stack_match within = parse_stack_match(within) + assert subst_name is not None + try: subst = kernel.substitutions[subst_name] except KeyError: @@ -548,38 +543,11 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, c_subst_name = subst_name.replace(".", "_") - # {{{ handle default_tag - - from loopy.transform.data import _not_provided \ - as transform_data_not_provided - - if default_tag is _not_provided or default_tag is transform_data_not_provided: - # no need to warn for scalar precomputes - if sweep_inames: - from warnings import warn - warn( - "Not specifying default_tag is deprecated, and default_tag " - "will become mandatory in 2019.x. " - "Pass 'default_tag=\"l.auto\" to match the current default, " - "or Pass 'default_tag=None to leave the loops untagged, which " - "is the recommended behavior.", - DeprecationWarning, stacklevel=( - - # In this case, we came here through add_prefetch. Increase - # the stacklevel. 
- 3 if default_tag is transform_data_not_provided - - else 2)) - - default_tag = "l.auto" - from loopy.kernel.data import parse_tag default_tag = parse_tag(default_tag) # }}} - # }}} - # {{{ process invocations in footprint generators, start access_descriptors if footprint_generators: @@ -632,6 +600,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, expanding_usage_arg_deps = set() for accdesc in access_descriptors: + assert accdesc.args is not None + for arg in accdesc.args: expanding_usage_arg_deps.update( get_dependencies(arg) & kernel.all_inames()) @@ -684,8 +654,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, prior_storage_axis_name_dict = {} - storage_axis_names = [] - storage_axis_sources = [] # number for arg#, or iname + storage_axis_names: List[str] = [] + storage_axis_sources: List[Union[str, int]] = [] # number for arg#, or iname # {{{ check for pre-existing precompute_inames @@ -726,8 +696,11 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, storage_axis_names.append(name) if name not in preexisting_precompute_inames: - new_iname_to_tag[name] = storage_axis_to_tag.get( - tag_lookup_saxis, default_tag) + iname_tag = storage_axis_to_tag.get(tag_lookup_saxis, None) + if iname_tag is None: + iname_tag = default_tag + if iname_tag is not None: + new_iname_to_tag[name] = iname_tag prior_storage_axis_name_dict[name] = old_name @@ -770,7 +743,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, # }}} - abm = ArrayToBufferMap(kernel, domch.domain, sweep_inames, + abm: ArrayToBufferMapBase = ArrayToBufferMap( + kernel, domch.domain, sweep_inames, access_descriptors, len(storage_axis_names)) non1_storage_axis_names = [] @@ -778,7 +752,8 @@ def precompute_for_single_kernel(kernel, callables_table, subst_use, if abm.non1_storage_axis_flags[i]: non1_storage_axis_names.append(saxis) else: - del new_iname_to_tag[saxis] + if saxis in new_iname_to_tag: + del 
new_iname_to_tag[saxis] if saxis in preexisting_precompute_inames: raise LoopyError("precompute axis %d (1-based) was " @@ -911,14 +886,38 @@ def add_assumptions(d): storage_axis_subst_dict = {} - for arg_name, bi in zip(storage_axis_names, abm.storage_base_indices): - if arg_name in non1_storage_axis_names: - arg = var(arg_name) - else: + for i, (arg_name, base_index) in enumerate( + zip(storage_axis_names, abm.storage_base_indices)): + is_length_1 = arg_name not in non1_storage_axis_names + if is_length_1: arg = 0 + else: + arg = var(arg_name) + + # FIXME: Hacky workaround, remove when no longer needed. + # Some transform code in the mirgecom transform stack + # first deletes inames from instructions if they're unused and then + # gets upset when they've disappeared. Without this 'special handling' + # here, this code will replace 0-length axis subscripts with '0', as it + # should. + + if _enable_mirgecom_workaround: + from pymbolic.primitives import Expression + if is_length_1 and not isinstance(base_index, Expression): + # I.e. base_index is an integer. + from pytools import is_single_valued + if is_single_valued( + not_none(accdesc.storage_axis_exprs)[i] + for accdesc in access_descriptors): + assert access_descriptors[0].storage_axis_exprs is not None + storage_axis_expr = access_descriptors[0].storage_axis_exprs[i] + if not (get_dependencies(storage_axis_expr) & sweep_inames_set): + # I.e. no sweeping in this axis. 
+ base_index = storage_axis_expr storage_axis_subst_dict[ - prior_storage_axis_name_dict.get(arg_name, arg_name)] = arg+bi + prior_storage_axis_name_dict.get(arg_name, arg_name)] = \ + arg+base_index rule_mapping_context = SubstitutionRuleMappingContext( kernel.substitutions, kernel.get_var_name_generator()) @@ -944,7 +943,7 @@ def add_assumptions(d): # within_inames determined below ) compute_dep_id = compute_insn_id - added_compute_insns = [compute_insn] + added_compute_insns: List[InstructionBase] = [compute_insn] if temporary_address_space == AddressSpace.GLOBAL: barrier_insn_id = kernel.make_unique_instruction_id( @@ -976,7 +975,7 @@ def add_assumptions(d): kernel = invr.map_kernel(kernel) kernel = kernel.copy( - instructions=added_compute_insns + kernel.instructions) + instructions=added_compute_insns + list(kernel.instructions)) kernel = rule_mapping_context.finish_kernel(kernel) # }}} @@ -1011,8 +1010,11 @@ def add_assumptions(d): .with_transformed_expressions( lambda expr: expr_subst_map(expr, kernel, insn)) # noqa: B023,E501 .copy(within_inames=frozenset( - storage_axis_subst_dict.get(iname, var(iname)).name - for iname in insn.within_inames))) + new_iname + for iname in insn.within_inames + for new_iname in get_dependencies( + storage_axis_subst_dict.get(iname, var(iname))) + ))) new_insns.append(insn) else: @@ -1051,19 +1053,19 @@ def add_assumptions(d): # {{{ set up temp variable import loopy as lp - if dtype is not None: - dtype = np.dtype(dtype) + + loopy_type = to_loopy_type(dtype, allow_none=True) if temporary_address_space is None: temporary_address_space = lp.auto new_temp_shape = tuple(abm.non1_storage_shape) - new_temporary_variables = kernel.temporary_variables.copy() + new_temporary_variables = dict(kernel.temporary_variables) if temporary_name not in new_temporary_variables: temp_var = lp.TemporaryVariable( name=temporary_name, - dtype=dtype, + dtype=loopy_type, base_indices=(0,)*len(new_temp_shape), shape=tuple(abm.non1_storage_shape), 
address_space=temporary_address_space, @@ -1087,6 +1089,7 @@ def add_assumptions(d): temp_var = temp_var.copy(dtype=dtype) + assert isinstance(temp_var.shape, tuple) if len(temp_var.shape) != len(new_temp_shape): raise LoopyError("Existing and new temporary '%s' do not " "have matching number of dimensions ('%d' vs. '%d') " @@ -1157,6 +1160,6 @@ def precompute(program, *args, **kwargs): new_callables[func_id] = clbl - return program.copy(callables_table=new_callables) + return program.copy(callables_table=Map(new_callables)) # vim: foldmethod=marker diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py index 2cbf21fec..3851bbdeb 100644 --- a/loopy/transform/privatize.py +++ b/loopy/transform/privatize.py @@ -364,6 +364,8 @@ def unprivatize_temporaries_with_inames( var_name_to_remove_indices = ir.var_name_to_remove_indices + from loopy.kernel.array import VectorArrayDimTag + new_temp_vars = kernel.temporary_variables.copy() for tv_name, tv in new_temp_vars.items(): remove_indices = var_name_to_remove_indices.get(tv_name, {}) @@ -374,6 +376,8 @@ def unprivatize_temporaries_with_inames( new_dim_tags = tv.dim_tags if new_dim_tags is not None: + new_dim_tags = ["vec" if isinstance(dim_tag, VectorArrayDimTag) else "c" + for idim, dim_tag in enumerate(new_dim_tags)] new_dim_tags = tuple(dim for idim, dim in enumerate(new_dim_tags) if idim not in remove_indices) diff --git a/loopy/transform/realize_reduction.py b/loopy/transform/realize_reduction.py index b8ddabbbc..c211ab18e 100644 --- a/loopy/transform/realize_reduction.py +++ b/loopy/transform/realize_reduction.py @@ -36,7 +36,7 @@ import islpy as isl from pymbolic.primitives import Expression -from pyrsistent import PMap +from immutables import Map from loopy.kernel.data import make_assignment from loopy.symbolic import ReductionCallbackMapper @@ -90,7 +90,7 @@ class _ReductionRealizationContext: domains: List[isl.BasicSet] additional_iname_tags: Dict[str, Sequence[Tag]] # list only to facilitate 
mutation - boxed_callables_table: List[PMap] + boxed_callables_table: List[Map] # FIXME: This is a broken-by-design concept. Local-parallel scans emit a # reduction internally. This serves to avoid force_scan acting on that @@ -2168,6 +2168,6 @@ def realize_reduction(t_unit, *args, **kwargs): subkernel=new_knl) callables_table[knl.name] = in_knl_callable - return t_unit.copy(callables_table=callables_table) + return t_unit.copy(callables_table=Map(callables_table)) # vim: foldmethod=marker diff --git a/loopy/translation_unit.py b/loopy/translation_unit.py index a81732135..39fdb2275 100644 --- a/loopy/translation_unit.py +++ b/loopy/translation_unit.py @@ -1,3 +1,5 @@ +from __future__ import annotations + __copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" __license__ = """ @@ -21,22 +23,29 @@ """ import collections +from collections.abc import Set as abc_Set +from dataclasses import field, dataclass, replace +from typing import FrozenSet, Optional, TYPE_CHECKING, Mapping, Callable, Union, Any +from warnings import warn -from pytools import ImmutableRecord from pymbolic.primitives import Variable from functools import wraps from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction, SubstitutionRuleMappingContext) from loopy.kernel.function_interface import ( - CallableKernel, ScalarCallable) -from loopy.diagnostic import LoopyError + CallableKernel, InKernelCallable, ScalarCallable) +from loopy.diagnostic import LoopyError, DirectCallUncachedWarning from loopy.library.reduction import ReductionOpFunction from loopy.kernel import LoopKernel -from loopy.tools import update_persistent_hash +from loopy.target import TargetBase from pymbolic.primitives import Call -from pyrsistent import pmap, PMap +from immutables import Map + +if TYPE_CHECKING: + from loopy.target.execution import ExecutorBase + __doc__ = """ .. 
currentmodule:: loopy.translation_unit @@ -127,7 +136,11 @@ def map_call_with_kwargs(self, expr): # {{{ translation unit -class TranslationUnit(ImmutableRecord): +FunctionIdT = Union[str, ReductionOpFunction] + + +@dataclass(frozen=True) +class TranslationUnit: """ Records the information about all the callables in a :mod:`loopy` program. @@ -162,6 +175,7 @@ class TranslationUnit(ImmutableRecord): TargetBase, function_indentifier: str)`` that returns an instance of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + .. automethod:: executor .. automethod:: __call__ .. automethod:: copy .. automethod:: __getitem__ @@ -176,47 +190,21 @@ class TranslationUnit(ImmutableRecord): :meth:`~TranslationUnit.copy`. """ - def __init__(self, - entrypoints=frozenset(), - callables_table=None, - target=None, - func_id_to_in_knl_callable_mappers=None): - - # {{{ sanity checks - - if callables_table is None: - callables_table = pmap() - if func_id_to_in_knl_callable_mappers is None: - func_id_to_in_knl_callable_mappers = [] - assert isinstance(callables_table, collections.abc.Mapping) - assert isinstance(entrypoints, frozenset) + callables_table: Map[FunctionIdT, CallableKernel] + target: TargetBase + entrypoints: FrozenSet[str] - if not isinstance(callables_table, PMap): - callables_table = pmap(callables_table) + def __post_init__(self): - # }}} - - super().__init__( - entrypoints=entrypoints, - callables_table=pmap(callables_table), - target=target, - func_id_to_in_knl_callable_mappers=( - func_id_to_in_knl_callable_mappers)) - - self._program_executor_cache = {} - self._hash_value = None + assert isinstance(self.entrypoints, abc_Set) + assert isinstance(self.callables_table, Map) - hash_fields = ( - "entrypoints", - "callables_table", - "target",) - - update_persistent_hash = update_persistent_hash + object.__setattr__(self, "_program_executor_cache", {}) def copy(self, **kwargs): target = kwargs.pop("target", None) - program = super().copy(**kwargs) + 
program = replace(self, **kwargs) if target: from loopy.kernel import KernelState if max(callable_knl.subkernel.state @@ -239,8 +227,8 @@ def copy(self, **kwargs): raise NotImplementedError() new_callables[func_id] = clbl - program = super().copy( - callables_table=new_callables, target=target) + program = replace( + self, callables_table=Map(new_callables), target=target) return program @@ -253,7 +241,7 @@ def with_entrypoints(self, entrypoints): entrypoints = frozenset([e.strip() for e in entrypoints.split(",")]) - assert isinstance(entrypoints, frozenset) + assert isinstance(entrypoints, abc_Set) return self.copy(entrypoints=entrypoints) @@ -279,7 +267,7 @@ def with_kernel(self, kernel): # update the callable kernel new_in_knl_callable = self.callables_table[kernel.name].copy( subkernel=kernel) - new_callables = self.callables_table.remove(kernel.name).set( + new_callables = self.callables_table.delete(kernel.name).set( kernel.name, new_in_knl_callable) return self.copy(callables_table=new_callables) else: @@ -310,6 +298,47 @@ def default_entrypoint(self): " The default entrypoint kernel is not uniquely" " determined.") + def executor(self, + *args, entrypoint: Optional[str] = None, **kwargs) -> ExecutorBase: + """Return an object that hosts caches of compiled code for execution (i.e. + a subclass of :class:`ExecutorBase`, specific to an execution + environment (e.g. an OpenCL context) and a given entrypoint. + + :arg entrypoint: The name of the entrypoint callable to be called. + Defaults to :attr:`default_entrypoint`. + An error will result if multiple entrypoints exist and no + entrypoint is specified. + + The variable arguments to this are target-specific. The + :class:`PyOpenCLTarget` takes a :class:`~pyopencl.Context` or a + :class:`~pyopencl.CommandQueue`. 
+ """ + if entrypoint is None: + nentrypoints = len(self.entrypoints) + if nentrypoints == 1: + entrypoint, = self.entrypoints + elif nentrypoints > 1: + raise ValueError("TranslationUnit has multiple possible entrypoints." + " The default entrypoint kernel is not uniquely" + " determined. You may explicitly specify an " + " entrypoint using the 'entrypoint' kwarg.") + elif nentrypoints == 0: + raise ValueError("TranslationUnit has no entrypoints, but" + f" {len(self.callables_table)} callables." + " Use TranslationUnit.with_entrypoints to" + " set an entrypoint.") + else: + raise AssertionError + else: + if entrypoint not in self.entrypoints: + raise LoopyError(f"'{entrypoint}' not in list of possible " + "entrypoints for the translation unit. " + "Maybe you want to invoke 'with_entrypoints' before " + "calling the translation unit?") + + return self.target.get_kernel_executor(self, *args, + entrypoint=entrypoint, **kwargs) + def __call__(self, *args, **kwargs): """ Builds and calls the *entrypoint* kernel, if @@ -317,8 +346,33 @@ def __call__(self, *args, **kwargs): :arg entrypoint: The name of the entrypoint callable to be called. Defaults to :attr:`default_entrypoint`. + + .. warning:: + + While this was the main execution interface for loopy for many + years (and reasonably efficient), the caches that made this so + kept lots of expensive 'stuff' (such as OpenCL contexts) alive + for no good reason, leading to major inefficiencies. + See :meth:`executor` for an efficient, cached way to + invoke kernels. """ + + # The rationale for this is that the executor cache held long-lived + # references to OpenCL contexts, and translation units were kept alive + # long-term by caches, leading to many stale contexts being kept alive. + # While attempts were made to turn those into weak references, this was + # ultimately cumbersome and ineffective. 
+ # + # In addition, the executor interface speeds up kernel invocation + # by removing one unnecessary layer of function call. + warn("TranslationUnit.__call__ will become uncached in 2024, " + "meaning it will incur possibly substantial compilation cost " + "with every invocation. Use TranslationUnit.executor to obtain " + "an object that holds longer-lived caches.", + DirectCallUncachedWarning, stacklevel=2) + entrypoint = kwargs.get("entrypoint", None) + if entrypoint is None: nentrypoints = len(self.entrypoints) if nentrypoints == 1: @@ -346,38 +400,38 @@ def __call__(self, *args, **kwargs): key = self.target.get_kernel_executor_cache_key(*args, **kwargs) try: - pex = self._program_executor_cache[key] + pex = self._program_executor_cache[key] # pylint: disable=no-member except KeyError: pex = self.target.get_kernel_executor(self, *args, **kwargs) - self._program_executor_cache[key] = pex + self._program_executor_cache[key] = pex # pylint: disable=no-member + + del kwargs["entrypoint"] return pex(*args, **kwargs) def __str__(self): # FIXME: do a topological sort by the call graph - def strify_callable(clbl): - return str(clbl.subkernel) - return "\n".join( - strify_callable(clbl) + str(clbl.subkernel) for name, clbl in self.callables_table.items() if isinstance(clbl, CallableKernel)) - def __setstate__(self, state_obj): - super().__setstate__(state_obj) + # FIXME: Delete these when _program_executor_cache leaves the building + def __getstate__(self): + from dataclasses import asdict + return asdict(self) - self._program_executor_cache = {} + def __setstate__(self, state_obj): + for k, v in state_obj.items(): + object.__setattr__(self, k, v) - def __hash__(self): - # NOTE: _hash_value may vanish during pickling - if getattr(self, "_hash_value", None) is None: - from loopy.tools import LoopyKeyBuilder - key_hash = LoopyKeyBuilder.new_hash() - self.update_persistent_hash(key_hash, LoopyKeyBuilder()) - self._hash_value = hash(key_hash.digest()) + 
object.__setattr__(self, "_program_executor_cache", {}) - return self._hash_value + # FIXME: This is here because Firedrake expects it, for some legacy reason. + # Without that, it would be safe to delete. + def update_persistent_hash(self, key_hash, key_builder): + key_builder.update_for_dataclass(key_hash, self) # }}} @@ -455,7 +509,8 @@ def make_clbl_inf_ctx(callables, entrypoints): return CallablesInferenceContext(callables, name_gen) -class CallablesInferenceContext(ImmutableRecord): +@dataclass(frozen=True) +class CallablesInferenceContext: """ Helper class for housekeeping a :attr:`loopy.TranslationUnit.callables_table` while traversing through callables of :class:`loopy.TranslationUnit`. @@ -480,18 +535,14 @@ class CallablesInferenceContext(ImmutableRecord): .. automethod:: __getitem__ """ - def __init__(self, callables, - clbl_name_gen, - renames=None, - new_entrypoints=frozenset()): - if renames is None: - renames = collections.defaultdict(frozenset) - assert isinstance(callables, collections.abc.Mapping) - - super().__init__(callables=dict(callables), - clbl_name_gen=clbl_name_gen, - renames=renames, - new_entrypoints=new_entrypoints) + callables: Mapping[str, InKernelCallable] + clbl_name_gen: Callable[[str], str] + renames: Mapping[str, FrozenSet[str]] = field( + default_factory=lambda: collections.defaultdict(frozenset)) + new_entrypoints: FrozenSet[str] = frozenset() + + def copy(self, **kwargs: Any) -> CallablesInferenceContext: + return replace(self, **kwargs) def with_callable(self, old_function_id, new_clbl, is_entrypoint=False): @@ -515,7 +566,7 @@ def with_callable(self, old_function_id, new_clbl, if isinstance(old_function_id, Variable): old_function_id = old_function_id.name - renames = self.renames.copy() + renames = collections.defaultdict(frozenset, self.renames) # if the callable already exists => return the function # identifier corresponding to that callable. 
@@ -556,7 +607,7 @@ def with_callable(self, old_function_id, new_clbl, # must allocate a new clbl in the namespace => find a unique id for it unique_function_id = self.clbl_name_gen(old_function_id) - updated_callables = self.callables.copy() + updated_callables = dict(self.callables) updated_callables[unique_function_id] = new_clbl renames[old_function_id] |= frozenset([unique_function_id]) @@ -642,7 +693,7 @@ def finish_program(self, program): # }}} - return program.copy(callables_table=new_callables) + return program.copy(callables_table=Map(new_callables)) def __getitem__(self, name): result = self.callables[name] @@ -653,18 +704,17 @@ def __getitem__(self, name): # {{{ helper functions -def make_program(kernel): +def make_program(kernel: LoopKernel) -> TranslationUnit: """ Returns an instance of :class:`loopy.TranslationUnit` with *kernel* as the only callable kernel. """ - program = TranslationUnit( - callables_table={ - kernel.name: CallableKernel(kernel)}, - target=kernel.target) - - return program + return TranslationUnit( + callables_table=Map({ + kernel.name: CallableKernel(kernel)}), + target=kernel.target, + entrypoints=frozenset()) def for_each_kernel(transform): @@ -696,7 +746,7 @@ def _collective_transform(*args, **kwargs): new_callables[func_id] = clbl - return t_unit.copy(callables_table=new_callables) + return t_unit.copy(callables_table=Map(new_callables)) else: assert isinstance(t_unit_or_kernel, LoopKernel) kernel = t_unit_or_kernel @@ -791,7 +841,7 @@ def resolve_callables(program): else: raise NotImplementedError(f"{type(clbl)}") - program = program.copy(callables_table=callables_table) + program = program.copy(callables_table=Map(callables_table)) validate_kernel_call_sites(program) diff --git a/loopy/types.py b/loopy/types.py index 57b9548bb..4c3b74ea6 100644 --- a/loopy/types.py +++ b/loopy/types.py @@ -22,10 +22,10 @@ THE SOFTWARE. 
""" -from typing import Any, Mapping -from warnings import warn +from typing import Any, Mapping, Type, Union import numpy as np +from loopy.typing import auto from loopy.diagnostic import LoopyError __doc__ = """ @@ -46,24 +46,24 @@ class LoopyType: Abstract class for dtypes of variables encountered in a :class:`loopy.LoopKernel`. """ - def is_integral(self): + def is_integral(self) -> bool: raise NotImplementedError() - def is_complex(self): + def is_complex(self) -> bool: raise NotImplementedError() - def uses_complex(self): + def uses_complex(self) -> bool: raise NotImplementedError() - def is_composite(self): + def is_composite(self) -> bool: raise NotImplementedError() @property - def itemsize(self): + def itemsize(self) -> int: raise NotImplementedError() @property - def numpy_dtype(self): + def numpy_dtype(self) -> np.dtype: raise ValueError("'%s' is not a numpy type" % str(self)) @@ -78,8 +78,8 @@ class AtomicType(LoopyType): # {{{ numpy-based dtype class NumpyType(LoopyType): - def __init__(self, dtype, target=None): - assert not isinstance(dtype, NumpyType) + def __init__(self, dtype: np.dtype): + assert not isinstance(dtype, LoopyType) if dtype is None: raise TypeError("may not pass None to construct NumpyType") @@ -87,33 +87,27 @@ def __init__(self, dtype, target=None): if dtype == object: raise TypeError("loopy does not directly support object arrays") - if target is not None: - warn("Passing target is deprecated and will stop working in 2022.", - DeprecationWarning, stacklevel=2) - self.dtype = np.dtype(dtype) - def __hash__(self): + def __hash__(self) -> int: return hash(self.dtype) def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, self.dtype) - def __eq__(self, other): + def __eq__(self, other: object) -> bool: return ( - type(self) == type(other) - and self.dtype == other.dtype) - - def __ne__(self, other): - return not self.__eq__(other) + type(self) is type(other) + # mypy doesn't understand 'type(self) is 
type(other)' + and self.dtype == other.dtype) # type: ignore[attr-defined] - def is_integral(self): + def is_integral(self) -> bool: return self.dtype.kind in "iu" - def is_complex(self): + def is_complex(self) -> bool: return self.dtype.kind == "c" - def involves_complex(self): + def involves_complex(self) -> bool: def dtype_involves_complex(dtype): if dtype.kind == "c": return True @@ -131,14 +125,14 @@ def is_composite(self): return self.dtype.kind == "V" @property - def itemsize(self): + def itemsize(self) -> int: return self.dtype.itemsize @property - def numpy_dtype(self): + def numpy_dtype(self) -> np.dtype: return self.dtype - def __repr__(self): + def __repr__(self) -> str: return "np:" + repr(self.dtype) # }}} @@ -171,43 +165,42 @@ class OpaqueType(LoopyType): through one ValueArg and go out to another. It is introduced to accomodate functional calls to external libraries. """ - def __init__(self, name): + def __init__(self, name: str) -> None: assert isinstance(name, str) self.name = name - def is_integral(self): + def is_integral(self) -> bool: return False - def is_complex(self): + def is_complex(self) -> bool: return False - def involves_complex(self): + def involves_complex(self) -> bool: return False def update_persistent_hash(self, key_hash, key_builder): key_builder.rec(key_hash, self.name) - def __hash__(self): + def __hash__(self) -> int: return hash(self.name) - def __eq__(self, other): + def __eq__(self, other: object) -> bool: return ( - type(self) == type(other) - and self.name == other.name) - - def __ne__(self, other): - return not self.__eq__(other) + type(self) is type(other) + # mypy doesn't understand 'type(self) is type(other)' + and self.name == other.name # type: ignore[attr-defined] + ) # }}} -def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False, - target=None): - if target is not None: - warn("Passing target is deprecated and will stop working in 2022.", - DeprecationWarning, stacklevel=2) 
+ToLoopyTypeConvertible = Union[Type[auto], None, np.dtype, LoopyType] + - from loopy.kernel.data import auto +def to_loopy_type(dtype: ToLoopyTypeConvertible, + allow_auto: bool = False, allow_none: bool = False, + for_atomic: bool = False + ) -> Union[Type[auto], None, LoopyType]: if dtype is None: if allow_none: return None @@ -216,7 +209,8 @@ def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False, elif dtype is auto: if allow_auto: - return dtype + # mypy doesn't seem to catch that this narrows the type of dtype + return dtype # type: ignore[return-value] else: raise LoopyError("dtype may not be auto") @@ -224,7 +218,9 @@ def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False, if dtype is not None: try: - numpy_dtype = np.dtype(dtype) + # We're playing fast and loose here, and mypy is onto us. It has a + # point. + numpy_dtype = np.dtype(dtype) # type: ignore except Exception: pass diff --git a/loopy/typing.py b/loopy/typing.py index d6714d870..5a20d2e0b 100644 --- a/loopy/typing.py +++ b/loopy/typing.py @@ -21,7 +21,7 @@ """ -from typing import Union, Tuple +from typing import Union, Tuple, TypeVar, Optional import numpy as np @@ -36,3 +36,18 @@ ExpressionT = Union[IntegralT, FloatT, Expression] ShapeType = Tuple[ExpressionT, ...] StridesType = ShapeType + + +class auto: # noqa + """A generic placeholder object for something that should be automatically + determined. See, for example, the *shape* or *strides* argument of + :class:`ArrayArg`. 
+ """ + + +T = TypeVar("T") + + +def not_none(obj: Optional[T]) -> T: + assert obj is not None + return obj diff --git a/loopy/version.py b/loopy/version.py index 5372b5935..f66c24dee 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -42,7 +42,7 @@ # }}} -VERSION = (2022, 1) +VERSION = (2024, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/setup.cfg b/setup.cfg index e3a8cc8bd..822df80d7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,6 +18,7 @@ per-file-ignores = [tool:pytest] doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS +addopts = --ignore=proto-tests --ignore=loopy/target/c/compyte/ndarray [mypy] python_version = 3.8 diff --git a/setup.py b/setup.py index a14cee92c..d9b8f6643 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ def write_git_revision(package_name): python_requires="~=3.8", install_requires=[ - "pytools>=2022.1.7", + "pytools>=2023.1.1", "pymbolic>=2022.1", "genpy>=2016.1.2", @@ -98,6 +98,7 @@ def write_git_revision(package_name): "Mako", "pyrsistent", "immutables", + "typing_extensions", ], extras_require={ diff --git a/test/test_c_execution.py b/test/test_c_execution.py index 4b5fdb1f8..11e336e0d 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -336,11 +336,6 @@ def eval_tester(knl): # the default (non-guessed) toolchain! 
__test(eval_tester, ExecutableCTarget, compiler=ccomp) - # and test that we will fail if we remove a required attribute - del ccomp.toolchain.undefines - with pytest.raises(AttributeError): - __test(eval_tester, ExecutableCTarget, compiler=ccomp) - # next test that some made up compiler can be specified ccomp = CCompiler(cc="foo") assert isinstance(ccomp.toolchain, GCCToolchain) diff --git a/test/test_einsum.py b/test/test_einsum.py index bada5c8c9..c3ed4ec98 100644 --- a/test/test_einsum.py +++ b/test/test_einsum.py @@ -26,6 +26,7 @@ import loopy as lp import numpy as np import pyopencl as cl +import pyopencl.array from pyopencl.tools import \ pytest_generate_tests_for_pyopencl as pytest_generate_tests # noqa @@ -140,6 +141,28 @@ def test_einsum_array_ops_triple_prod(ctx_factory, spec): assert np.linalg.norm(out - ans) <= 1e-15 +def test_einsum_with_variable_strides(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + spec = "ijk,jl->il" + knl = lp.make_einsum(spec, ("a", "b"), + default_order=lp.auto, default_offset=lp.auto) + + a_untransposed = np.random.randn(3, 5, 4) + b = np.random.randn(4, 5) + + a = a_untransposed.transpose((0, 2, 1)) + a_dev = cl.array.to_device(queue, a_untransposed).transpose((0, 2, 1)) + assert a_dev.strides == a.strides + + _evt, (result,) = knl(queue, a=a_dev, b=b) + + ref = np.einsum(spec, a, b) + + assert np.allclose(result.get(), ref) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_loopy.py b/test/test_loopy.py index 6a4079f00..2d1c7bc22 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1768,7 +1768,8 @@ def test_ilp_and_conditionals(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl) -def test_unr_and_conditionals(ctx_factory): +@pytest.mark.parametrize("unr_tag", ["unr", "unr_hint"]) +def test_unr_and_conditionals(ctx_factory, unr_tag): ctx = ctx_factory() knl = lp.make_kernel("{[k]: 0<=k t_unit = lp.make_kernel( "{[i,j,k]: 0<=i,j<72 and 0<=k<32}", @@ 
-3609,6 +3610,64 @@ def test_dgemm_with_rectangular_tile_prefetch(ctx_factory): lp.auto_test_vs_ref(ref_t_unit, ctx, t_unit) +def test_modulo_vs_type_context(ctx_factory): + t_unit = lp.make_kernel( + "{[i]: 0 <= i < 10}", + """ + # previously, the float 'type context' would propagate into + # the remainder, leading to 'i % 10.0' being generated, which + # C/OpenCL did not like. + a = i % 10 + """) + + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + t_unit(queue) + + +def test_barrier_non_zero_hw_lbound(): + t_unit = lp.make_kernel( + ["{[i]: 1<=i<17}", + "{[j]: 0<=j<16}"], + """ + <> a[i] = i {id=w_a} + <> b[j] = 2*a[j] {id=w_b} + """) + + t_unit = lp.tag_inames(t_unit, {"i": "l.0", "j": "l.0"}) + + t_unit = lp.preprocess_kernel(t_unit) + knl = lp.get_one_linearized_kernel(t_unit.default_entrypoint, + t_unit.callables_table) + + assert barrier_between(knl, "w_a", "w_b") + + +def test_no_unnecessary_lbarrier(ctx_factory): + # This regression would fail on loopy.git <= 268a7f4 + # (Issue reported by @thilinarmtb) + + t_unit = lp.make_kernel( + "{[i_outer, i_inner]: 0 <= i_outer < n and 0 <= i_inner < 16}", + """ + <> s_a[i_inner] = ai[i_outer * 16 + i_inner] {id=write_s_a} + ao[i_outer * 16 + i_inner] = 2.0 * s_a[i_inner] {id=write_ao, dep=write_s_a} + """, + assumptions="n>=0") + + t_unit = lp.add_dtypes(t_unit, dict(ai=np.float32)) + t_unit = lp.tag_inames(t_unit, dict(i_inner="l.0", i_outer="g.0")) + t_unit = lp.set_temporary_address_space(t_unit, "s_a", "local") + t_unit = lp.prioritize_loops(t_unit, "i_outer,i_inner") + + t_unit = lp.preprocess_kernel(t_unit) + knl = lp.get_one_linearized_kernel(t_unit.default_entrypoint, + t_unit.callables_table) + + assert not barrier_between(knl, "write_s_a", "write_ao") + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_target.py b/test/test_target.py index 389c865b6..10a04ed5b 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -400,7 +400,7 @@ def 
test_nan_support(ctx_factory, target): lp.Assignment(parse("c"), parse("isnan(3.14)")), lp.Assignment(parse("d"), parse("isnan(0.0)")), lp.Assignment(parse("e"), NaN(np.float32)), - lp.Assignment(parse("f"), Variable("isnan")(NaN())), + lp.Assignment(parse("f"), Variable("isnan")(NaN(None))), lp.Assignment(parse("g"), NaN(np.complex64)), lp.Assignment(parse("h"), NaN(np.complex128)), ], @@ -560,22 +560,6 @@ def test_input_args_are_required(ctx_factory): _ = knl(queue) -def test_pyopencl_execution_accepts_device_scalars(ctx_factory): - import pyopencl.array as cla - - ctx = ctx_factory() - cq = cl.CommandQueue(ctx) - - knl = lp.make_kernel("{:}", - """ - y = 2*x - """) - - evt, (out,) = knl(cq, x=cla.to_device(cq, np.asarray(21))) - - np.testing.assert_allclose(out.get(), 42) - - def test_pyopencl_target_with_global_temps_with_base_storage(ctx_factory): from pyopencl.tools import ImmediateAllocator @@ -640,7 +624,7 @@ def test_glibc_bessel_functions(dtype): second_kind_bessel[i] = bessel_yn(n, x[i]) """, target=lp.ExecutableCWithGNULibcTarget(compiler)) - if knl.target.compiler.toolchain.cc not in ["gcc", "g++"]: + if knl.target.compiler.toolchain.cc not in ["gcc", "g++"]: # pylint: disable=no-member # noqa: E501 pytest.skip("GNU-libc not found.") knl = lp.fix_parameters(knl, n=2) diff --git a/test/test_transform.py b/test/test_transform.py index 1b56344e7..5ca01dea0 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -1668,6 +1668,38 @@ def test_remove_predicates_from_insn(): assert t_unit == ref_t_unit +def test_precompute_lets_length1_inames_live_if_requested(): + t_unit = lp.make_kernel( + "{[e,i]: 0<=e<1 and 0<=i<10}", + """ + v(e, i) := e + i + out[e, i] = v(e, i) + """) + + t_unit = lp.precompute(t_unit, "v", "i", _enable_mirgecom_workaround=True) + + from pymbolic import parse + assert t_unit.default_entrypoint.id_to_insn["v"].expression == parse("e + i_0") + + +def test_precompute_lets_inner_length1_inames_live(): + t_unit = lp.make_kernel( 
+ "{[e,i]: 0<=e<1 and 0<=i<10}", + """ + v(e, i) := e / i + #v(eee, i) := eee + i + out[e, i] = v(e, i) + """) + + t_unit = lp.split_iname(t_unit, "e", 16) + t_unit = lp.precompute(t_unit, "v", "i", _enable_mirgecom_workaround=True) + + from pymbolic import parse + assert ( + t_unit.default_entrypoint.id_to_insn["v"].expression + == parse("(e_inner + e_outer*16) / i_0")) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])