diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 3b572e310..7a6c294fb 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -23,6 +23,7 @@ from pytools import ImmutableRecord import sys +import islpy as isl from loopy.diagnostic import warn_with_kernel, LoopyError # noqa from pytools import MinRecursionLimit, ProcessLogger @@ -214,24 +215,40 @@ def find_loop_nest_around_map(kernel): """Returns a dictionary mapping inames to other inames that are always nested around them. """ - from collections import defaultdict - from loopy.schedule.tools import get_loop_nest_tree + result = {} + + all_inames = kernel.all_inames() - tree = get_loop_nest_tree(kernel) + iname_to_insns = kernel.iname_to_insns() - loop_nest_around_map = defaultdict(frozenset) + # examine pairs of all inames--O(n**2), I know. + from loopy.kernel.data import IlpBaseTag + for inner_iname in all_inames: + result[inner_iname] = set() + for outer_iname in all_inames: + if inner_iname == outer_iname: + continue - for node in tree.all_nodes_itr(): - if node.identifier == tree.root: - continue - iname = node.identifier - depth = tree.depth(iname) - all_ancestors = frozenset(tree.ancestor(iname, d).identifier - for d in range(1, depth)) + if kernel.iname_tags_of_type(outer_iname, IlpBaseTag): + # ILP tags are special because they are parallel tags + # and therefore 'in principle' nest around everything. + # But they're realized by the scheduler as a loop + # at the innermost level, so we'll cut them some + # slack here. 
+ continue + + if iname_to_insns[inner_iname] < iname_to_insns[outer_iname]: + result[inner_iname].add(outer_iname) + + for dom_idx, dom in enumerate(kernel.domains): + for outer_iname in dom.get_var_names(isl.dim_type.param): + if outer_iname not in all_inames: + continue - loop_nest_around_map[iname] = all_ancestors + for inner_iname in dom.get_var_names(isl.dim_type.set): + result[inner_iname].add(outer_iname) - return loop_nest_around_map + return result def find_loop_insn_dep_map(kernel, loop_nest_with_map, loop_nest_around_map): @@ -802,6 +819,10 @@ def _get_dep_equivalent_nests(tree, within1, within2): return iname1, iname2 +class V2SchedulerNotImplementedException(RuntimeError): + pass + + def generate_loop_schedules_v2(kernel): from loopy.schedule.tools import get_loop_nest_tree from functools import reduce @@ -809,10 +830,13 @@ def generate_loop_schedules_v2(kernel): from loopy.kernel.data import ConcurrentTag, IlpBaseTag, VectorizeTag if any(insn.priority != 0 for insn in kernel.instructions): - raise NotImplementedError + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " kernels with instruction priorities set.") if kernel.schedule is not None: - raise NotImplementedError + # cannot handle preschedule yet + raise V2SchedulerNotImplementedException("v2 scheduler cannot schedule" + " prescheduled kernels.") concurrent_inames = {iname for iname in kernel.all_inames() if kernel.iname_tags_of_type(iname, ConcurrentTag)} @@ -2074,154 +2098,147 @@ def generate_loop_schedules_inner(kernel, debug_args={}): from loopy.check import pre_schedule_checks pre_schedule_checks(kernel) - can_v2_scheduler_handle = ( - # v2-scheduler cannot handle insn groups - all(len(insn.conflicts_with_groups) == 0 - for insn in kernel.instructions) - # v2-scheduler cannot handle prescheduled kernel - and (not kernel.schedule) - # v2-scheduler cannot handle instruction priorities - and all(insn.priority == 0 - for insn in kernel.instructions) - ) - - if 
can_v2_scheduler_handle: + try: gen_sched = generate_loop_schedules_v2(kernel) yield postprocess_schedule(kernel, gen_sched) - else: - schedule_count = 0 - - debug = ScheduleDebugger(**debug_args) - - preschedule = (kernel.schedule - - if kernel.state == KernelState.LINEARIZED - - else ()) - - prescheduled_inames = { - insn.iname - for insn in preschedule - if isinstance(insn, EnterLoop)} - - prescheduled_insn_ids = { - insn_id - for item in preschedule - for insn_id in sched_item_to_insn_id(item)} - - from loopy.kernel.data import (IlpBaseTag, ConcurrentTag, VectorizeTag, - filter_iname_tags_by_type) - ilp_inames = { - name - for name, iname in kernel.inames.items() - if filter_iname_tags_by_type(iname.tags, IlpBaseTag)} - vec_inames = { - name - for name, iname in kernel.inames.items() - if filter_iname_tags_by_type(iname.tags, VectorizeTag)} - parallel_inames = { - name - for name, iname in kernel.inames.items() - if filter_iname_tags_by_type(iname.tags, ConcurrentTag)} - - loop_nest_with_map = find_loop_nest_with_map(kernel) - loop_nest_around_map = find_loop_nest_around_map(kernel) - sched_state = SchedulerState( - kernel=kernel, - loop_nest_around_map=loop_nest_around_map, - loop_insn_dep_map=find_loop_insn_dep_map( - kernel, - loop_nest_with_map=loop_nest_with_map, - loop_nest_around_map=loop_nest_around_map), - breakable_inames=ilp_inames, - ilp_inames=ilp_inames, - vec_inames=vec_inames, - - prescheduled_inames=prescheduled_inames, - prescheduled_insn_ids=prescheduled_insn_ids, - - # time-varying part - active_inames=(), - entered_inames=frozenset(), - enclosing_subkernel_inames=(), - - schedule=(), - - unscheduled_insn_ids={insn.id for insn in kernel.instructions}, - scheduled_insn_ids=frozenset(), - within_subkernel=kernel.state != KernelState.LINEARIZED, - may_schedule_global_barriers=True, - - preschedule=preschedule, - insn_ids_to_try=None, - - # ilp and vec are not parallel for the purposes of the scheduler - parallel_inames=parallel_inames - 
ilp_inames - vec_inames, - - group_insn_counts=group_insn_counts(kernel), - active_group_counts={}, - - insns_in_topologically_sorted_order=( - get_insns_in_topologically_sorted_order(kernel)), - ) - - schedule_gen_kwargs = {} - - def print_longest_dead_end(): - if debug.interactive: - print("Loopy will now show you the scheduler state at the point") - print("where the longest (dead-end) schedule was generated, in the") - print("the hope that some of this makes sense and helps you find") - print("the issue.") - print() - print("To disable this interactive behavior, pass") - print(" debug_args=dict(interactive=False)") - print("to generate_loop_schedules().") - print(75*"-") - input("Enter:") - print() - print() - - debug.debug_length = len(debug.longest_rejected_schedule) - while True: - try: - for _ in generate_loop_schedules_internal( - sched_state, debug=debug, **schedule_gen_kwargs): - pass - - except ScheduleDebugInput as e: - debug.debug_length = int(str(e)) - continue + return + except V2SchedulerNotImplementedException as e: + from warnings import warn + warn(f"Falling back to a slow scheduler implementation due to: {e}") - break + schedule_count = 0 - try: - for gen_sched in generate_loop_schedules_internal( - sched_state, debug=debug, **schedule_gen_kwargs): - debug.stop() + debug = ScheduleDebugger(**debug_args) + + preschedule = (kernel.schedule + + if kernel.state == KernelState.LINEARIZED + + else ()) + + prescheduled_inames = { + insn.iname + for insn in preschedule + if isinstance(insn, EnterLoop)} + + prescheduled_insn_ids = { + insn_id + for item in preschedule + for insn_id in sched_item_to_insn_id(item)} + + from loopy.kernel.data import (IlpBaseTag, ConcurrentTag, VectorizeTag, + filter_iname_tags_by_type) + ilp_inames = { + name + for name, iname in kernel.inames.items() + if filter_iname_tags_by_type(iname.tags, IlpBaseTag)} + vec_inames = { + name + for name, iname in kernel.inames.items() + if filter_iname_tags_by_type(iname.tags, 
VectorizeTag)} + parallel_inames = { + name + for name, iname in kernel.inames.items() + if filter_iname_tags_by_type(iname.tags, ConcurrentTag)} + + loop_nest_with_map = find_loop_nest_with_map(kernel) + loop_nest_around_map = find_loop_nest_around_map(kernel) + sched_state = SchedulerState( + kernel=kernel, + loop_nest_around_map=loop_nest_around_map, + loop_insn_dep_map=find_loop_insn_dep_map( + kernel, + loop_nest_with_map=loop_nest_with_map, + loop_nest_around_map=loop_nest_around_map), + breakable_inames=ilp_inames, + ilp_inames=ilp_inames, + vec_inames=vec_inames, + + prescheduled_inames=prescheduled_inames, + prescheduled_insn_ids=prescheduled_insn_ids, + + # time-varying part + active_inames=(), + entered_inames=frozenset(), + enclosing_subkernel_inames=(), + + schedule=(), + + unscheduled_insn_ids={insn.id for insn in kernel.instructions}, + scheduled_insn_ids=frozenset(), + within_subkernel=kernel.state != KernelState.LINEARIZED, + may_schedule_global_barriers=True, + + preschedule=preschedule, + insn_ids_to_try=None, - new_kernel = postprocess_schedule(kernel, gen_sched) - yield new_kernel + # ilp and vec are not parallel for the purposes of the scheduler + parallel_inames=parallel_inames - ilp_inames - vec_inames, - debug.start() + group_insn_counts=group_insn_counts(kernel), + active_group_counts={}, - schedule_count += 1 + insns_in_topologically_sorted_order=( + get_insns_in_topologically_sorted_order(kernel)), + ) - except KeyboardInterrupt: + schedule_gen_kwargs = {} + + def print_longest_dead_end(): + if debug.interactive: + print("Loopy will now show you the scheduler state at the point") + print("where the longest (dead-end) schedule was generated, in the") + print("the hope that some of this makes sense and helps you find") + print("the issue.") print() + print("To disable this interactive behavior, pass") + print(" debug_args=dict(interactive=False)") + print("to generate_loop_schedules().") print(75*"-") - print("Interrupted during 
scheduling") - print(75*"-") - print_longest_dead_end() - raise + input("Enter:") + print() + print() - debug.done_scheduling() - if not schedule_count: - print(75*"-") - print("ERROR: Sorry--loopy did not find a schedule for your kernel.") - print(75*"-") - print_longest_dead_end() - raise RuntimeError("no valid schedules found") + debug.debug_length = len(debug.longest_rejected_schedule) + while True: + try: + for _ in generate_loop_schedules_internal( + sched_state, debug=debug, **schedule_gen_kwargs): + pass + + except ScheduleDebugInput as e: + debug.debug_length = int(str(e)) + continue + + break + + try: + for gen_sched in generate_loop_schedules_internal( + sched_state, debug=debug, **schedule_gen_kwargs): + debug.stop() + + new_kernel = postprocess_schedule(kernel, gen_sched) + yield new_kernel + + debug.start() + + schedule_count += 1 + + except KeyboardInterrupt: + print() + print(75*"-") + print("Interrupted during scheduling") + print(75*"-") + print_longest_dead_end() + raise + + debug.done_scheduling() + if not schedule_count: + print(75*"-") + print("ERROR: Sorry--loopy did not find a schedule for your kernel.") + print(75*"-") + print_longest_dead_end() + raise RuntimeError("no valid schedules found") logger.info("%s: schedule done" % kernel.name) diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 29b92dcfa..3bcb27af9 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -221,7 +221,10 @@ def _update_flow_requirements(priorities, cannot_satisfy_callback): # inner iname and outer iname are indirect family members # => must be realized via dependencies in the linearization # phase - raise NotImplementedError + from loopy.schedule import V2SchedulerNotImplementedException + raise V2SchedulerNotImplementedException("cannot" + " schedule kernels with priority dependencies" + " between sibling loop nests") def _raise_loopy_err(x): raise LoopyError(x)