Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implements CVectorExtensionsTarget #557

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
127 changes: 107 additions & 20 deletions examples/python/call-external.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from loopy.diagnostic import LoopyError
from loopy.target.c import CTarget
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401
from loopy.target.c.c_execution import CCompiler
from codepy.toolchain import GCCToolchain


# {{{ blas callable
Expand All @@ -22,7 +24,7 @@ def with_types(self, arg_id_to_dtype, callables_table):

if vec_dtype.numpy_dtype == np.float32:
name_in_target = "cblas_sgemv"
elif vec_dtype. numpy_dtype == np.float64:
elif vec_dtype.numpy_dtype == np.float64:
name_in_target = "cblas_dgemv"
else:
raise LoopyError("GEMV is only supported for float32 and float64 "
Expand All @@ -47,30 +49,37 @@ def with_descrs(self, arg_id_to_descr, callables_table):
assert mat_descr.shape[0] == res_descr.shape[0]
assert len(vec_descr.shape) == len(res_descr.shape) == 1
# handling only the easy case when stride == 1
assert vec_descr.dim_tags[0].stride == 1
assert mat_descr.dim_tags[1].stride == 1
assert res_descr.dim_tags[0].stride == 1

return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table

def emit_call_insn(self, insn, target, expression_to_code_mapper):
    """Build the CBLAS GEMV call expression for *insn*.

    :arg insn: the call instruction; its expression parameters are the
        matrix and the vector, and its single assignee is the result.
    :arg target: unused here.
    :arg expression_to_code_mapper: maps loopy expressions to target code;
        also consulted for the current vectorization state.
    :returns: a tuple ``(call_expression, False)``.
    :raises UnvectorizableError: if code is currently being generated in
        vectorized mode.
    """
    from pymbolic import var
    from loopy.codegen import UnvectorizableError

    descrs = self.arg_id_to_descr
    mat_descr = descrs[0]
    vec_descr = descrs[1]
    res_descr = descrs[-1]
    n_rows, n_cols = mat_descr.shape
    to_code = expression_to_code_mapper

    # The BLAS call cannot be emitted in vectorized form; raising here
    # signals that to the code generator.
    if to_code.codegen_state.vectorization_info is not None:
        raise UnvectorizableError("cannot vectorize BLAS-gemv.")

    mat_expr, vec_expr = insn.expression.parameters
    (result_expr,) = insn.assignees

    gemv_args = (
        var("CblasRowMajor"),
        var("CblasNoTrans"),
        n_rows, n_cols,
        1,  # alpha
        to_code(mat_expr).expr,
        mat_descr.dim_tags[0].stride,  # LDA
        to_code(vec_expr).expr,
        vec_descr.dim_tags[0].stride,  # INCX
        0,  # beta
        to_code(result_expr).expr,
        res_descr.dim_tags[0].stride,  # INCY
    )
    gemv_call = var(self.name_in_target)(*gemv_args)

    # cblas_gemv does not return anything
    return gemv_call, False
Expand All @@ -83,17 +92,95 @@ def generate_preambles(self, target):
# }}}


n = 10

knl = lp.make_kernel(
"{:}",
def transform_1(knl):
    """Return *knl* unchanged (the untransformed baseline)."""
    untouched = knl
    return untouched


def transform_2(knl):
    """Return *knl* split over ``e`` with the inner iname tagged ``vec``.

    Splits iname ``e`` by 4 into ``e_inner``, privatizes temporaries with
    respect to ``e_inner``, tags ``e_inner`` as ``vec`` and tags the axes
    of ``tmp3`` and ``tmp2`` as ``c,vec``.
    """
    # A similar transformation is applied to kernels containing
    # SLATE <https://www.firedrakeproject.org/firedrake.slate.html>
    # callables.
    split_knl = lp.split_iname(
        knl, "e", 4, inner_iname="e_inner", slabs=(0, 1))
    privatized_knl = lp.privatize_temporaries_with_inames(
        split_knl, "e_inner")
    vec_knl = lp.tag_inames(privatized_knl, {"e_inner": "vec"})

    # Tagging only "tmp3" as "c,vec" would be an easier codegen exercise,
    # but misses vectorizing certain instructions; hence tag "tmp2" as well.
    return lp.tag_array_axes(vec_knl, "tmp3,tmp2", "c,vec")


def main():
    """Build a kernel calling BLAS GEMV, transform it, and check its output.

    Creates a GCC-based compiler linked against ``blas``, builds a kernel
    that scales ``x``, applies ``matvec`` and scales again, registers
    :class:`CBLASGEMV` for ``matvec``, and then, for each transformation,
    prints the generated device code and compares the kernel output with a
    NumPy reference.
    """
    compiler = CCompiler(toolchain=GCCToolchain(
        cc="gcc",
        cflags="-std=c99 -O3 -fPIC".split(),
        ldflags="-shared".split(),
        libraries=["blas"],
        library_dirs=[],
        defines=[],
        undefines=[],
        source_suffix="c",
        so_ext=".so",
        o_ext=".o",
        include_dirs=[]))

    knl = lp.make_kernel(
        "{[e,i1,i2]: 0<=e<n and 0<=i1,i2<4}",
        """
        for e
            for i1
                tmp1[i1] = 3*x[e, i1]
            end
            tmp2[:] = matvec(A[:, :], tmp1[:])
            for i2
                <> tmp3[i2] = 2 * tmp2[i2]
                out[e, i2] = tmp3[i2]
            end
        end
        """,
        kernel_data=[
            lp.TemporaryVariable("tmp1",
                                 shape=(4, ),
                                 dtype=None),
            lp.TemporaryVariable("tmp2",
                                 shape=(4, ),
                                 dtype=None),
            lp.GlobalArg("A",
                         shape=(4, 4),
                         dtype="float64"),
            lp.GlobalArg("x",
                         shape=lp.auto,
                         dtype="float64"),
            ...],
        target=lp.ExecutableCVectorExtensionsTarget(compiler=compiler),
        lang_version=(2018, 2))

    knl = lp.register_callable(knl, "matvec", CBLASGEMV("matvec"))

    for transform_func in [transform_1, transform_2]:
        # Transformations accumulate: each one is applied on top of the
        # kernel produced by the previous one.
        knl = transform_func(knl)
        # The f-prefix is required so the function name is interpolated.
        print(f"Generated code from '{transform_func.__name__} -----'")
        print(lp.generate_code_v2(knl).device_code())
        print(75 * "-")

        # {{{ verify the result is correct.

        from numpy.random import default_rng

        rng = default_rng(seed=0)
        a = rng.random((4, 4))
        x = rng.random((100, 4))

        _, (out,) = knl(A=a, x=x)

        # The kernel computes 2 * (A @ (3 * x[e])) for every row e.
        np.testing.assert_allclose(6*np.einsum("ij,ej->ei",
                                               a, x),
                                   out)

        # }}}

if __name__ == "__main__":
main()
12 changes: 8 additions & 4 deletions loopy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@
AddressSpace,
TemporaryVariable,
SubstitutionRule,
CallMangleInfo)
CallMangleInfo,
VectorizeTag)
from loopy.kernel.function_interface import (
CallableKernel, ScalarCallable)
from loopy.translation_unit import (
Expand Down Expand Up @@ -150,10 +151,12 @@
from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran,
parse_fortran)

from loopy.target import TargetBase, ASTBuilderBase
from loopy.target import TargetBase, ASTBuilderBase, VectorizationFallback
from loopy.target.c import (CFamilyTarget, CTarget, ExecutableCTarget,
generate_header, CWithGNULibcTarget,
ExecutableCWithGNULibcTarget)
from loopy.target.c_vector_extensions import (CVectorExtensionsTarget,
ExecutableCVectorExtensionsTarget)
from loopy.target.cuda import CudaTarget
from loopy.target.opencl import OpenCLTarget
from loopy.target.pyopencl import PyOpenCLTarget
Expand Down Expand Up @@ -190,7 +193,7 @@
"AddressSpace",
"TemporaryVariable",
"SubstitutionRule",
"CallMangleInfo",
"CallMangleInfo", "VectorizeTag",

"make_kernel", "UniqueName", "make_function",

Expand Down Expand Up @@ -298,9 +301,10 @@

"LoopyError", "LoopyWarning",

"TargetBase",
"TargetBase", "VectorizationFallback",
"CFamilyTarget", "CTarget", "ExecutableCTarget", "generate_header",
"CWithGNULibcTarget", "ExecutableCWithGNULibcTarget",
"CVectorExtensionsTarget", "ExecutableCVectorExtensionsTarget",
"CudaTarget", "OpenCLTarget",
"PyOpenCLTarget", "ISPCTarget",
"ASTBuilderBase",
Expand Down
10 changes: 6 additions & 4 deletions loopy/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,13 +605,15 @@ def check_for_data_dependent_parallel_bounds(kernel):
Check that inames tagged as hw axes have bounds that are known at kernel
launch.
"""
from loopy.kernel.data import ConcurrentTag
from loopy.kernel.data import LocalInameTagBase, GroupInameTag

for i, dom in enumerate(kernel.domains):
dom_inames = set(dom.get_var_names(dim_type.set))
par_inames = {
iname for iname in dom_inames
if kernel.iname_tags_of_type(iname, ConcurrentTag)}
# Do not check vec-inames: their implementation comes with a
# fallback mechanism.
par_inames = {iname for iname in dom_inames
if kernel.iname_tags_of_type(iname, (LocalInameTagBase,
GroupInameTag))}

if not par_inames:
continue
Expand Down
42 changes: 33 additions & 9 deletions loopy/codegen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,23 +281,47 @@ def try_vectorized(self, what, func):
return self.unvectorize(func)

def unvectorize(self, func):
from loopy.codegen.result import (merge_codegen_results,
CodeGenerationResult)
from loopy.target import VectorizationFallback

vinf = self.vectorization_info
assert vinf is not None

result = []
novec_self = self.copy(vectorization_info=None)

for i in range(vinf.length):
idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i
new_codegen_state = novec_self.fix(vinf.iname, idx_aff)
generated = func(new_codegen_state)

if isinstance(generated, list):
result.extend(generated)
if self.target.vectorization_fallback == VectorizationFallback.UNROLL:
for i in range(vinf.length):
idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i
new_codegen_state = novec_self.fix(vinf.iname, idx_aff)
generated = func(new_codegen_state)

if isinstance(generated, list):
result.extend(generated)
else:
result.append(generated)
elif self.target.vectorization_fallback == VectorizationFallback.OMP_SIMD:
astb = self.ast_builder
inner = func(novec_self)
if isinstance(inner, list):
inner = merge_codegen_results(novec_self, inner)
assert isinstance(inner, CodeGenerationResult)
if isinstance(inner.current_ast(novec_self),
astb.ast_comment_class):
# loop body is a comment => do not emit the loop
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is puzzling. Could you explain what leads to this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A noop instruction is emitted as a comment.

loop_cgr = inner
else:
result.append(generated)
result.append(astb.emit_pragma("omp simd"))
loop_cgr = inner.with_new_ast(
novec_self,
astb.emit_sequential_loop(
novec_self, vinf.iname, self.kernel.index_dtype,
0, vinf.length-1, inner.current_ast(novec_self)))
result.append(loop_cgr)
else:
raise NotImplementedError(self.target.vectorization_fallback)

from loopy.codegen.result import merge_codegen_results
return merge_codegen_results(self, result)

@property
Expand Down
7 changes: 7 additions & 0 deletions loopy/codegen/instruction.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,13 @@ def generate_assignment_instruction_code(codegen_state, insn):
raise UnvectorizableError(
"LHS is scalar, RHS is vector, cannot assign")

if (lhs_is_vector
and (not rhs_is_vector)
and (not
kernel.target.broadcasts_scalar_assignment_to_vec_types)):
raise UnvectorizableError(
"LHS is vector, RHS is not vector, cannot assign")

is_vector = lhs_is_vector

del lhs_is_vector
Expand Down
32 changes: 30 additions & 2 deletions loopy/codegen/loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,10 +160,25 @@ def generate_unroll_loop(codegen_state, sched_index):

# {{{ vectorized loops

def raise_for_unvectorizable_loop(codegen_state, sched_index):
    """Raise an error reporting that a loop cannot be vectorized.

    :arg codegen_state: code generation state; only its ``kernel``
        attribute is read.
    :arg sched_index: index of the offending item in the kernel's
        linearization.
    :raises RuntimeError: always; the message names the linearization item.
    """
    kernel = codegen_state.kernel
    # Read 'linearization', like generate_vectorize_loop in this module,
    # rather than the 'schedule' spelling.
    raise RuntimeError(
        f"Cannot vectorize {kernel.linearization[sched_index]}")


def generate_vectorize_loop(codegen_state, sched_index):
from loopy.kernel.data import VectorizeTag
from loopy.target import VectorizationFallback
kernel = codegen_state.kernel

iname = kernel.linearization[sched_index].iname
vec_tag, = kernel.inames[iname].tags_of_type(VectorizeTag)

if kernel.target.vectorization_fallback == VectorizationFallback.UNROLL:
fallback_codegen_routine = generate_unroll_loop
elif kernel.target.vectorization_fallback == VectorizationFallback.OMP_SIMD:
fallback_codegen_routine = generate_openmp_simd_loop
else:
raise NotImplementedError(kernel.target.vectorization_fallback)

bounds = kernel.get_iname_bounds(iname, constants_only=True)

Expand All @@ -177,7 +192,7 @@ def generate_vectorize_loop(codegen_state, sched_index):
warn(kernel, "vec_upper_not_const",
"upper bound for vectorized loop '%s' is not a constant, "
"cannot vectorize--unrolling instead")
return generate_unroll_loop(codegen_state, sched_index)
return fallback_codegen_routine(codegen_state, sched_index)

length = int(pw_aff_to_expr(length_aff))

Expand All @@ -192,7 +207,7 @@ def generate_vectorize_loop(codegen_state, sched_index):
warn(kernel, "vec_lower_not_0",
"lower bound for vectorized loop '%s' is not zero, "
"cannot vectorize--unrolling instead")
return generate_unroll_loop(codegen_state, sched_index)
return fallback_codegen_routine(codegen_state, sched_index)

# {{{ 'implement' vectorization bounds

Expand Down Expand Up @@ -484,4 +499,17 @@ def generate_sequential_loop_dim_code(codegen_state, sched_index):

# }}}


# {{{ omp simd loop

def generate_openmp_simd_loop(codegen_state, sched_index):
    """Generate a sequential loop preceded by an ``omp simd`` pragma.

    :arg codegen_state: current code generation state; its ``ast_builder``
        emits the pragma.
    :arg sched_index: index of the loop's schedule item, passed on to
        :func:`generate_sequential_loop_dim_code`.
    :returns: the merged result of the pragma followed by the loop code.
    """
    simd_pragma = codegen_state.ast_builder.emit_pragma("omp simd")
    loop_code = generate_sequential_loop_dim_code(codegen_state, sched_index)
    # The pragma must come directly before the loop it annotates.
    pieces = [simd_pragma, loop_code]
    return merge_codegen_results(codegen_state, pieces)

# }}}


# vim: foldmethod=marker
Loading