diff --git a/.basedpyright/baseline.json b/.basedpyright/baseline.json index faf88f576..ec5651565 100644 --- a/.basedpyright/baseline.json +++ b/.basedpyright/baseline.json @@ -8261,8 +8261,8 @@ "code": "reportUnknownArgumentType", "range": { "startColumn": 56, - "endColumn": 17, - "lineCount": 8 + "endColumn": 63, + "lineCount": 1 } }, { @@ -8273,6 +8273,14 @@ "lineCount": 1 } }, + { + "code": "reportUnknownArgumentType", + "range": { + "startColumn": 52, + "endColumn": 59, + "lineCount": 1 + } + }, { "code": "reportUnknownMemberType", "range": { diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 0dda9ba0a..9a29a5f28 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -85,14 +85,23 @@ def generate_code_for_sched_index( glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at(kernel.linearization, sched_index), codegen_state.callables_table) - return merge_codegen_results(codegen_state, [ - codegen_result, + prefixes, suffixes = ( + codegen_state.ast_builder.get_temporary_decl_at_index( + codegen_state, sched_index + ) + ) + results = [ + prefixes, + codegen_result, codegen_state.ast_builder.get_kernel_call( codegen_state, sched_item.kernel_name, - glob_grid, loc_grid) - ]) + glob_grid, loc_grid), + suffixes + ] + results = [r for r in results if r is not None] + return merge_codegen_results(codegen_state, results) else: # do not generate host code for non-entrypoint kernels return codegen_result @@ -136,7 +145,14 @@ def generate_code_for_sched_index( "for '%s', tagged '%s'" % (sched_item.iname, ", ".join(str(tag) for tag in tags))) - return func(codegen_state, sched_index) + prefixes, suffixes = ( + codegen_state.ast_builder.get_temporary_decl_at_index( + codegen_state, sched_index + ) + ) + results = [prefixes, func(codegen_state, sched_index), suffixes] + results = [r for r in results if r is not None] + return merge_codegen_results(codegen_state, results) elif isinstance(sched_item, 
Barrier): # {{{ emit barrier code diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 948dbf0f0..7d69aec9d 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -177,6 +177,137 @@ def supporting_temporary_names( return frozenset(result) + +def _get_temporaries_accessed_in_schedule( + kernel: LoopKernel, + sched_idx_lower_bound: int, + sched_idx_upper_bound: int + ) -> frozenset[str]: + from loopy.schedule import CallKernel, EnterLoop, LeaveLoop + + linearization = kernel.linearization + assert linearization is not None + + temporaries: frozenset[str] = frozenset() + for sched_index in range(sched_idx_lower_bound, sched_idx_upper_bound): + sched_item = linearization[sched_index] + if isinstance(sched_item, CallKernel): + temporaries = ( + temporaries_written_in_subkernel(kernel, sched_item.kernel_name) + | temporaries_read_in_subkernel( + kernel, sched_item.kernel_name + ) + | (temporaries) + ) + elif isinstance(sched_item, (EnterLoop, LeaveLoop)): + # ignore further outside-kernel loops + pass + + else: + raise NotImplementedError("kernel with non-CallKernel outermost") + + return temporaries + + +def _map_to_base_storage(kernel: LoopKernel, tv_names: Set[str]) -> Set[str]: + result: set[str] = set() + for tv_name in tv_names: + while True: + tv = kernel.temporary_variables[tv_name] + if tv.base_storage is not None: + tv_name = tv.base_storage + else: + break + + result.add(tv_name) + + return result + + +@memoize_on_first_arg +def get_sched_index_to_first_and_last_used( + kernel: LoopKernel + ) -> tuple[Mapping[int, Set[str]], Mapping[int, Set[str]]]: + """ + Returns the tuple (first_used, last_used), where first_used is + a dict such that first_used[sched_index] is the set of all global temporary + variable names first used at sched_index. + + Likewise, last_used[sched_index] is the set of all global temporary + variable names last used at sched_index. 
+ """ + from loopy.kernel.data import AddressSpace + from loopy.schedule import CallKernel, EnterLoop, Barrier + + assert kernel.linearization is not None + + global_temporaries = frozenset( + tv.name for tv in kernel.temporary_variables.values() + if tv.address_space == AddressSpace.GLOBAL + ) + + # Collapse into blocks + block_boundaries = get_block_boundaries(kernel.linearization) + + tvs_accessed_at: dict[int, frozenset[str]] = {} + sched_index = 0 + while sched_index < len(kernel.linearization): + sched_item = kernel.linearization[sched_index] + if isinstance(sched_item, CallKernel): + block_end = block_boundaries[sched_index] + tvs_accessed_at[sched_index] = ( + temporaries_written_in_subkernel(kernel, sched_item.kernel_name) + | temporaries_read_in_subkernel( + kernel, sched_item.kernel_name + ) + ) & global_temporaries + + sched_index = block_end + 1 + + elif isinstance(sched_item, EnterLoop): + block_end = block_boundaries[sched_index] + tvs_accessed_at[sched_index] = _get_temporaries_accessed_in_schedule( + kernel, sched_index, block_end+1 + ) & global_temporaries + + sched_index = block_end + 1 + + elif isinstance(sched_item, Barrier): + sched_index += 1 + else: + raise ValueError( + f"unexpected schedule item at outermost level: {type(sched_item)}") + + storage_vars_accessed_at = { + sched_index: _map_to_base_storage(kernel, accessed) + for sched_index, accessed in tvs_accessed_at.items() + } + del tvs_accessed_at + + # forward pass for first_accesses + first_accesses: dict[int, Set[str]] = {} + seen_storage_vars: set[str] = set() + for sched_index in range(0, len(kernel.linearization)): + accessed = storage_vars_accessed_at.get(sched_index, set()) + new_storage_vars = accessed - seen_storage_vars + seen_storage_vars.update(accessed) + + if new_storage_vars: + first_accesses[sched_index] = new_storage_vars + + # backward pass for last_accesses + last_accesses: dict[int, Set[str]] = {} + seen_storage_vars = set() + for sched_index in 
range(len(kernel.linearization)-1, -1, -1): + accessed = storage_vars_accessed_at.get(sched_index, set()) + new_storage_vars = accessed - seen_storage_vars + seen_storage_vars.update(accessed) + + if new_storage_vars: + last_accesses[sched_index] = new_storage_vars + + return (first_accesses, last_accesses) + # }}} diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 673a46d4d..feb03a4d0 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -59,6 +59,7 @@ from loopy.codegen import CodeGenerationState, PreambleInfo from loopy.codegen.result import CodeGenerationResult from loopy.kernel import LoopKernel + from loopy.kernel.data import TemporaryVariable from loopy.target.c import DTypeRegistry from loopy.target.execution import ExecutorBase from loopy.translation_unit import CallableId, CallablesTable, TranslationUnit @@ -251,6 +252,27 @@ def get_temporary_decls(self, codegen_state: CodeGenerationState, schedule_index: int) -> ASTType: raise NotImplementedError + @abstractmethod + def get_temporary_var_declarator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> ASTType | None: + ... + + @abstractmethod + def get_temporary_var_deallocator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> ASTType | None: + ... + + @abstractmethod + def get_temporary_decl_at_index( + self, codegen_state: CodeGenerationState, + sched_index: int + ) -> tuple[ASTType | None, ASTType | None]: + ... 
+ def get_kernel_call(self, codegen_state: CodeGenerationState, subkernel_name: str, gsize: tuple[Expression, ...], @@ -365,6 +387,27 @@ def get_expression_to_code_mapper(self, codegen_state): def get_kernel_call(self, codegen_state, name, gsize, lsize): return None + @override + def get_temporary_var_declarator( + self, codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> None: + return None + + @override + def get_temporary_var_deallocator( + self, codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> None: + return None + + @override + def get_temporary_decl_at_index( + self, codegen_state: CodeGenerationState, + sched_index: int + ) -> tuple[None, None]: + return (None, None) + @property def ast_block_class(self): return _DummyASTBlock diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 13026f61c..6b745eea2 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -37,6 +37,7 @@ from cgen import ( Block, Collection, + Comment, Const, Declarator, Generable, @@ -1109,6 +1110,12 @@ def get_temporary_decls(self, codegen_state, schedule_index): return result + @override + def get_temporary_decl_at_index( + self, codegen_state: CodeGenerationState, sched_index: int + ) -> tuple[Generable | None, Generable | None]: + return (None, None) + @property @override def ast_block_class(self): @@ -1242,6 +1249,7 @@ def arg_to_cgen_declarator( raise ValueError(f"unexpected type of argument '{passed_name}': " f"'{type(var_descr)}'") + @override def get_temporary_var_declarator(self, codegen_state: CodeGenerationState, temp_var: TemporaryVariable) -> Declarator: @@ -1274,6 +1282,12 @@ def get_temporary_var_declarator(self, return self.wrap_decl_for_address_space(temp_var_decl, temp_var.address_space) + @override + def get_temporary_var_deallocator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> Generable: + return Comment("Dynamic freeing of temp vars not supported") # 
}}} @override diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 9cb9a4e1d..acca1fb53 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -659,6 +659,7 @@ def get_kernel_executor(self, t_unit: TranslationUnit, # type: ignore[override] from loopy.target.pyopencl_execution import PyOpenCLExecutor return PyOpenCLExecutor(context, t_unit, entrypoint=entrypoint) + # }}} @@ -805,25 +806,15 @@ def get_function_definition( ["_lpy_cl_kernels", "queue", *kai.passed_arg_names, "wait_for=None", "allocator=None"]) - from genpy import For, Function, Line, Return, Statement as S, Suite + from genpy import Function, Line, Return, Suite return Function( codegen_result.current_program(codegen_state).name, args, Suite([ Line(), - ] + [ Line(), function_body, Line(), - ] + ([ - For("_tv", "_global_temporaries", - # Free global temporaries. - # Zero-size temporaries allocate as None, tolerate that. - # https://documen.tician.de/pyopencl/tools.html#pyopencl.tools.ImmediateAllocator - S("if _tv is not None: _tv.release()")) - ] if self._get_global_temporaries(codegen_state) else [] - ) + [ - Line(), Return("_lpy_evt"), ])) @@ -850,41 +841,69 @@ def get_temporary_decls(self, codegen_state: CodeGenerationState, schedule_index: int ): - from genpy import Assign, Comment, Line + return [] + + @override + def get_temporary_decl_at_index( + self, codegen_state: CodeGenerationState, sched_index: int + ) -> tuple[genpy.Generable | None, genpy.Generable | None]: + from loopy.schedule.tools import get_sched_index_to_first_and_last_used + first_accesses, last_accesses = get_sched_index_to_first_and_last_used( + codegen_state.kernel + ) + prefixes, suffixes = None, None + if sched_index in first_accesses: + prefix_lines: list[genpy.Generable] = [] + for tv_name in first_accesses[sched_index]: + prefix_lines.append( + self.get_temporary_var_declarator( + codegen_state, + codegen_state.kernel.temporary_variables[tv_name] + ) + ) + prefixes = 
self.ast_block_class(prefix_lines) + if sched_index in last_accesses: + suffix_lines: list[genpy.Generable] = [] + for tv_name in last_accesses[sched_index]: + suffix_lines.append( + self.get_temporary_var_deallocator( + codegen_state, + codegen_state.kernel.temporary_variables[tv_name] + ) + ) + suffixes = self.ast_block_class(suffix_lines) + return (prefixes, suffixes) + + @override + def get_temporary_var_declarator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> genpy.Generable: + from genpy import Assign, Suite from pymbolic.mapper.stringifier import PREC_NONE ecm = self.get_expression_to_code_mapper(codegen_state) - global_temporaries = self._get_global_temporaries(codegen_state) - if not global_temporaries: - return [] - - allocated_var_names: list[str] = [] - code_lines: list[genpy.Generable] = [] - code_lines.append(Line()) - code_lines.append(Comment("{{{ allocate global temporaries")) - code_lines.append(Line()) - - for tv in global_temporaries: - if not tv.base_storage: - if tv.nbytes: - # NB: This does not prevent all zero-size allocations, - # as sizes are parametric, and allocation size - # could turn out to be zero at runtime. - nbytes_str = ecm(tv.nbytes, PREC_NONE, type_context="i") - allocated_var_names.append(tv.name) - code_lines.append(Assign(tv.name, - f"allocator({nbytes_str})")) - else: - code_lines.append(Assign(tv.name, "None")) - - code_lines.append(Assign("_global_temporaries", "[{tvs}]".format( - tvs=", ".join(tv for tv in allocated_var_names)))) + if not temp_var.base_storage: + if temp_var.nbytes: + # NB: This does not prevent all zero-size allocations, + # as sizes are parametric, and allocation size + # could turn out to be zero at runtime. 
+ nbytes_str = ecm(temp_var.nbytes, PREC_NONE, type_context="i") + return Assign(temp_var.name, f"allocator({nbytes_str})") + else: + return Assign(temp_var.name, "None") - code_lines.append(Line()) - code_lines.append(Comment("}}}")) - code_lines.append(Line()) + return Suite() - return code_lines + @override + def get_temporary_var_deallocator( + self, codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> genpy.Generable: + from genpy import Statement + # Zero-size temporaries allocate as None, tolerate that. + # https://documen.tician.de/pyopencl/tools.html#pyopencl.tools.ImmediateAllocator + return Statement(f"if {temp_var.name} is not None: {temp_var.name}.release()") def get_kernel_call( self, codegen_state: CodeGenerationState, diff --git a/loopy/target/python.py b/loopy/target/python.py index d1bc51f56..51a575871 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -34,7 +34,7 @@ from pymbolic.mapper.stringifier import PREC_NONE, StringifyMapper from loopy.diagnostic import LoopyError -from loopy.kernel.data import ValueArg +from loopy.kernel.data import TemporaryVariable, ValueArg from loopy.kernel.function_interface import ScalarCallable from loopy.target import ASTBuilderBase from loopy.type_inference import TypeReader @@ -339,7 +339,26 @@ def emit_assignment(self, codegen_state: CodeGenerationState, insn: Assignment): ecm(insn.assignee, prec=PREC_NONE, type_context=None), ecm(insn.expression, prec=PREC_NONE, type_context=None)) - # }}} + @override + def get_temporary_var_declarator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> Generable | None: + return None + + @override + def get_temporary_var_deallocator(self, + codegen_state: CodeGenerationState, + temp_var: TemporaryVariable + ) -> Generable | None: + return None + + @override + def get_temporary_decl_at_index( + self, codegen_state: CodeGenerationState, + sched_index: int + ) -> tuple[Generable | None, Generable | None]: + return 
None, None # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 2f66bf377..bd41ef92a 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -3693,6 +3693,60 @@ def test_long_kernel(): lp.get_one_linearized_kernel(t_unit.default_entrypoint, t_unit.callables_table) +def test_temporary_memory_allocation(ctx_factory: cl.CtxFactory): + from pyopencl.tools import ImmediateAllocator, MemoryPool + + ctx = ctx_factory() + cq = cl.CommandQueue(ctx) + n = 16 + + knl = lp.make_kernel( + "{ [i]: 0<=i<n }", + """ + for i + <> b[i] = a[i] + ... gbarrier + <> c[i] = b[i] + 1 + ... gbarrier + <> d[i] = c[i] + 1 + ... gbarrier + <> e[i] = d[i] + 1 + ... gbarrier + <> f[i] = e[i] + 1 + ... gbarrier + <> g[i] = f[i] + 1 + ... gbarrier + <> h[i] = g[i] + 1 + ... gbarrier + <> j[i] = h[i] + 1 + ... gbarrier + <> k[i] = j[i] + 1 + ... gbarrier + <> l[i] = k[i] + 1 + ... gbarrier + <> m[i] = l[i] + 1 + ... gbarrier + out[i] = m[i] + end + """, seq_dependencies=True) + + knl = lp.add_and_infer_dtypes(knl, + {"a": np.float32}) + + temp_vars = list(knl.default_entrypoint.temporary_variables) + knl = lp.set_temporary_address_space(knl, temp_vars, "global") + + knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") + + mem_pool_alloc = MemoryPool(ImmediateAllocator(cq)) + + a = np.arange(n, dtype=np.float32) + knl(cq, a=a, allocator=mem_pool_alloc) + + # FIXME This relies on the memory pool not freeing any memory it allocates + assert mem_pool_alloc.managed_bytes < len(temp_vars) * a.nbytes + + @pytest.mark.filterwarnings("error:.*:loopy.LoopyWarning") def test_loop_imperfect_nest_priorities_in_v2_scheduler(): # Reported by Connor Ward. See .