Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
7b59f0f
Initial work on compiler profiling.
mcourteaux Mar 8, 2026
deea8f8
Refine lambda argument requirements in IRMutator and IRVisitor
alexreinking Mar 8, 2026
6bed833
Early exit in the loop-checking visitor
alexreinking Mar 8, 2026
4ea11aa
Compute last_use in-line
alexreinking Mar 8, 2026
2e9f117
Avoid redundant FindBufferUsage in For loop visitor
alexreinking Mar 8, 2026
9094ea8
fixup! Compute last_use in-line
alexreinking Mar 8, 2026
de8dc95
More profiling stuff.
mcourteaux Mar 8, 2026
def7be2
Fix build when not compiling in profiling.
mcourteaux Mar 8, 2026
dfc98d9
Disable RTTI naming when it's not enabled in the build config.
mcourteaux Mar 9, 2026
23071b2
Merge remote-tracking branch 'origin/alexreinking/inject-host-copies-…
mcourteaux Mar 9, 2026
b297402
Cleanup.
mcourteaux Mar 9, 2026
25aef21
Annotate InjectHostDevBufferCopies
mcourteaux Mar 9, 2026
cd1488a
Annotate Bounds and AddImageChecks
mcourteaux Mar 9, 2026
01cb49c
More annotating.
mcourteaux Mar 9, 2026
ec468a4
Clang-format and makefile fix, and support no RTTI.
mcourteaux Mar 9, 2026
f833bfd
Missing header in makefile.
mcourteaux Mar 9, 2026
865c601
Merge branch 'main' into compiler-profiling
mcourteaux Mar 14, 2026
2f3c0f3
Remove Profiled<...> from all mutators/visitors.
mcourteaux Mar 14, 2026
8c7aaed
Strip PerformanceCounter and use chrono instead, for simplicity.
mcourteaux Mar 14, 2026
c53b74b
Ditch profiled_xxx in favor of a simple call to operator()(...)
mcourteaux Mar 14, 2026
5738ed9
Change the main entry point of visitors and mutators to operator().
mcourteaux Mar 14, 2026
e6f9238
Clang-format
mcourteaux Mar 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ Halide_feature(WITH_DOCS "Halide's Doxygen documentation" OFF)
Halide_feature(WITH_PACKAGING "Halide's CMake package install rules" TOP_LEVEL)
Halide_feature(WITH_PYTHON_BINDINGS "Halide's native Python module (not the whole pip package)" ON
DEPENDS Halide_ENABLE_EXCEPTIONS AND Halide_ENABLE_RTTI)
Halide_feature(WITH_COMPILER_PROFILING "Enable internal compiler tracing" OFF)
Halide_feature(WITH_SERIALIZATION "Include experimental Serialization/Deserialization code" ON)
Halide_feature(WITH_SERIALIZATION_JIT_ROUNDTRIP_TESTING
"Intercepting JIT compilation with a serialization roundtrip, for test only"
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,7 @@ SOURCE_FILES = \
Pipeline.cpp \
Prefetch.cpp \
PrintLoopNest.cpp \
ProfiledIRVisitor.cpp \
Profiling.cpp \
PurifyIndexMath.cpp \
PythonExtensionGen.cpp \
Expand Down
10 changes: 7 additions & 3 deletions src/AddImageChecks.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "AddImageChecks.h"
#include "CompilerProfiling.h"
#include "ExternFuncArgument.h"
#include "Function.h"
#include "IRMutator.h"
Expand Down Expand Up @@ -103,6 +104,7 @@ class TrimStmtToPartsThatAccessBuffers : public IRMutator {
bool touches_buffer = false;
const map<string, FindBuffers::Result> &buffers;

protected:
using IRMutator::visit;

Expr visit(const Call *op) override {
Expand Down Expand Up @@ -165,7 +167,7 @@ Stmt add_image_checks_inner(Stmt s,
bool no_bounds_query = t.has_feature(Target::NoBoundsQuery);

// First hunt for all the referenced buffers
FindBuffers finder;
Profiled<FindBuffers> finder;
map<string, FindBuffers::Result> &bufs = finder.buffers;

// Add the output buffer(s).
Expand All @@ -188,7 +190,7 @@ Stmt add_image_checks_inner(Stmt s,
s.accept(&finder);

Scope<Interval> empty_scope;
Stmt sub_stmt = TrimStmtToPartsThatAccessBuffers(bufs).mutate(s);
Stmt sub_stmt = Profiled<TrimStmtToPartsThatAccessBuffers>(bufs).mutate(s);
map<string, Box> boxes = boxes_touched(sub_stmt, empty_scope, fb);

// Now iterate through all the buffers, creating a list of lets
Expand Down Expand Up @@ -737,6 +739,7 @@ Stmt add_image_checks(const Stmt &s,
// Checks for images go at the marker deposited by computation
// bounds inference.
class Injector : public IRMutator {
protected:
using IRMutator::visit;

Expr visit(const Variable *op) override {
Expand Down Expand Up @@ -794,7 +797,8 @@ Stmt add_image_checks(const Stmt &s,
bool will_inject_host_copies)
: outputs(outputs), t(t), order(order), env(env), fb(fb), will_inject_host_copies(will_inject_host_copies) {
}
} injector(outputs, t, order, env, fb, will_inject_host_copies);
};
Profiled<Injector> injector(outputs, t, order, env, fb, will_inject_host_copies);

return injector.mutate(s);
}
Expand Down
3 changes: 2 additions & 1 deletion src/AddParameterChecks.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "IRVisitor.h"
#include "Substitute.h"
#include "Target.h"
#include "CompilerProfiling.h"

namespace Halide {
namespace Internal {
Expand Down Expand Up @@ -35,7 +36,7 @@ class FindParameters : public IRGraphVisitor {
Stmt add_parameter_checks(const vector<Stmt> &preconditions, Stmt s, const Target &t) {

// First, find all the parameters
FindParameters finder;
Profiled<FindParameters> finder;
s.accept(&finder);

map<string, Expr> replace_with_constrained;
Expand Down
24 changes: 18 additions & 6 deletions src/AsyncProducers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ class NoOpCollapsingMutator : public IRMutator {
};

class GenerateProducerBody : public NoOpCollapsingMutator {
protected:
const string &func;
vector<Expr> sema;
std::set<string> producers_dropped;
Expand Down Expand Up @@ -285,6 +286,7 @@ class GenerateProducerBody : public NoOpCollapsingMutator {
};

class GenerateConsumerBody : public NoOpCollapsingMutator {
protected:
const string &func;
vector<Expr> sema;

Expand Down Expand Up @@ -342,6 +344,7 @@ class GenerateConsumerBody : public NoOpCollapsingMutator {
};

class CloneAcquire : public IRMutator {
protected:
using IRMutator::visit;

const string &old_name;
Expand Down Expand Up @@ -390,6 +393,7 @@ class CountConsumeNodes : public IRVisitor {
};

class ForkAsyncProducers : public IRMutator {
protected:
using IRMutator::visit;

const map<string, Function> &env;
Expand Down Expand Up @@ -493,6 +497,7 @@ class ForkAsyncProducers : public IRMutator {
// simple failure case, error_async_require_fail. One has not been
// written for the complex nested case yet.)
class InitializeSemaphores : public IRMutator {
protected:
using IRMutator::visit;

const Type sema_type = type_of<halide_semaphore_t *>();
Expand Down Expand Up @@ -558,6 +563,7 @@ class InitializeSemaphores : public IRMutator {
// A class to support stmt_uses_vars queries that repeatedly hit the same
// sub-stmts. Used to support TightenProducerConsumerNodes below.
class CachingStmtUsesVars : public IRMutator {
protected:
const Scope<> &query;
bool found_use = false;
std::map<Stmt, bool> cache;
Expand Down Expand Up @@ -613,6 +619,7 @@ class CachingStmtUsesVars : public IRMutator {

// Tighten the scope of consume nodes as much as possible to avoid needless synchronization.
class TightenProducerConsumerNodes : public IRMutator {
protected:
using IRMutator::visit;

Stmt make_producer_consumer(const string &name, bool is_producer, Stmt body, const Scope<> &scope, CachingStmtUsesVars &uses_vars) {
Expand Down Expand Up @@ -703,6 +710,7 @@ class TightenProducerConsumerNodes : public IRMutator {

// Update indices to add ring buffer.
class UpdateIndices : public IRMutator {
protected:
using IRMutator::visit;

Stmt visit(const Provide *op) override {
Expand Down Expand Up @@ -734,6 +742,7 @@ class UpdateIndices : public IRMutator {

// Inject ring buffering.
class InjectRingBuffering : public IRMutator {
protected:
using IRMutator::visit;

struct Loop {
Expand Down Expand Up @@ -816,6 +825,7 @@ class InjectRingBuffering : public IRMutator {
// Broaden the scope of acquire nodes to pack trailing work into the
// same task and to potentially reduce the nesting depth of tasks.
class ExpandAcquireNodes : public IRMutator {
protected:
using IRMutator::visit;

Stmt visit(const Block *op) override {
Expand Down Expand Up @@ -918,6 +928,7 @@ class ExpandAcquireNodes : public IRMutator {
};

class TightenForkNodes : public IRMutator {
protected:
using IRMutator::visit;

Stmt make_fork(const Stmt &first, const Stmt &rest) {
Expand Down Expand Up @@ -1005,12 +1016,13 @@ class TightenForkNodes : public IRMutator {
} // namespace

Stmt fork_async_producers(Stmt s, const map<string, Function> &env) {
s = TightenProducerConsumerNodes(env).mutate(s);
s = InjectRingBuffering(env).mutate(s);
s = ForkAsyncProducers(env).mutate(s);
s = ExpandAcquireNodes().mutate(s);
s = TightenForkNodes().mutate(s);
s = InitializeSemaphores().mutate(s);
ZoneScoped;
s = Profiled<TightenProducerConsumerNodes>(env).profiled_mutate(s);
s = Profiled<InjectRingBuffering>(env).profiled_mutate(s);
s = Profiled<ForkAsyncProducers>(env).profiled_mutate(s);
s = Profiled<ExpandAcquireNodes>().profiled_mutate(s);
s = Profiled<TightenForkNodes>().profiled_mutate(s);
s = Profiled<InitializeSemaphores>().profiled_mutate(s);
return s;
}

Expand Down
1 change: 1 addition & 0 deletions src/BoundSmallAllocations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ class BoundSmallAllocations : public IRMutator {
} // namespace

Stmt bound_small_allocations(const Stmt &s) {
ZoneScoped;
return BoundSmallAllocations().mutate(s);
}

Expand Down
8 changes: 8 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ target_sources(
CodeGen_Vulkan_Dev.h
CodeGen_WebGPU_Dev.h
CompilerLogger.h
CompilerProfiling.h
ConciseCasts.h
CPlusPlusMangle.h
ConstantBounds.h
Expand Down Expand Up @@ -166,6 +167,7 @@ target_sources(
Param.h
Parameter.h
PartitionLoops.h
PerformanceCounter.h
Pipeline.h
Prefetch.h
PrefetchDirective.h
Expand Down Expand Up @@ -269,6 +271,7 @@ target_sources(
CodeGen_WebGPU_Dev.cpp
CodeGen_X86.cpp
CompilerLogger.cpp
CompilerProfiling.cpp
CPlusPlusMangle.cpp
ConstantBounds.cpp
ConstantInterval.cpp
Expand Down Expand Up @@ -515,6 +518,11 @@ target_compile_definitions(Halide PRIVATE WITH_SPIRV)
target_compile_definitions(Halide PRIVATE WITH_VULKAN)
target_compile_definitions(Halide PRIVATE WITH_WEBGPU)

if (WITH_COMPILER_PROFILING)
target_compile_definitions(Halide PRIVATE WITH_COMPILER_PROFILING)
endif()


##
# Flatbuffers and Serialization dependencies.
##
Expand Down
32 changes: 21 additions & 11 deletions src/CSE.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <map>

#include "CSE.h"
#include "CompilerProfiling.h"
#include "IREquality.h"
#include "IRMutator.h"
#include "IROperator.h"
Expand Down Expand Up @@ -186,6 +187,7 @@ class Replacer : public IRGraphMutator {
};

class RemoveLets : public IRGraphMutator {
protected:
using IRGraphMutator::visit;

Scope<Expr> scope;
Expand Down Expand Up @@ -218,6 +220,7 @@ class RemoveLets : public IRGraphMutator {
};

class CSEEveryExprInStmt : public IRMutator {
protected:
bool lift_all;
using IRMutator::visit;

Expand Down Expand Up @@ -260,6 +263,7 @@ class CSEEveryExprInStmt : public IRMutator {
} // namespace

Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) {
ZoneScoped;
Expr e = e_in;

// Early-out for trivial cases.
Expand All @@ -269,14 +273,15 @@ Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) {

debug(4) << "\n\n\nInput to CSE " << e << "\n";

e = RemoveLets().mutate(e);
e = Profiled<RemoveLets>().profiled_mutate(e);

debug(4) << "After removing lets: " << e << "\n";

// CSE is run on unsanitized Exprs from the user, and may contain Vars with
// the same name as the temporaries we intend to introduce. Find any such
// Vars so that we know not to use those names.
class UniqueNameProvider : public IRGraphVisitor {
protected:
using IRGraphVisitor::visit;

const char prefix = 't'; // Annoyingly, this can't be static because this is a local class.
Expand All @@ -303,14 +308,18 @@ Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) {
} while (vars.count(name));
return name;
}
} namer;
e.accept(&namer);
};
Profiled<UniqueNameProvider> namer;
{
ZoneScopedN("UniqueNameProvider");
e.accept(&namer);
}

GVN gvn;
e = gvn.mutate(e);
Profiled<GVN> gvn;
e = gvn.profiled_mutate(e);

ComputeUseCounts count_uses(gvn, lift_all);
count_uses.include(e);
Profiled<ComputeUseCounts> count_uses(gvn, lift_all);
count_uses.profiled_include(e);

debug(4) << "Canonical form without lets " << e << "\n";

Expand All @@ -330,8 +339,8 @@ Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) {
}

// Rebuild the expr to include references to the variables:
Replacer replacer(replacements);
e = replacer.mutate(e);
Profiled<Replacer> replacer(replacements);
e = replacer.profiled_mutate(e);

debug(4) << "With variables " << e << "\n";

Expand All @@ -340,7 +349,7 @@ Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) {
// Drop this variable as an acceptable replacement for this expr.
replacer.erase(value);
// Use containing lets in the value.
e = Let::make(var, replacer.mutate(value), e);
e = Let::make(var, replacer.profiled_mutate(value), e);
}

debug(4) << "With lets: " << e << "\n";
Expand All @@ -349,7 +358,8 @@ Expr common_subexpression_elimination(const Expr &e_in, bool lift_all) {
}

Stmt common_subexpression_elimination(const Stmt &s, bool lift_all) {
return CSEEveryExprInStmt(lift_all).mutate(s);
ZoneScoped;
return Profiled<CSEEveryExprInStmt>(lift_all).profiled_mutate(s);
}

// Testing code.
Expand Down
2 changes: 1 addition & 1 deletion src/CodeGen_ARM.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2551,7 +2551,7 @@ bool CodeGen_ARM::supports_call_as_float16(const Call *op) const {
} // namespace

std::unique_ptr<CodeGen_Posix> new_CodeGen_ARM(const Target &target) {
return std::make_unique<CodeGen_ARM>(target);
return std::make_unique<Profiled<CodeGen_ARM>>(target);
}

#else // WITH_ARM || WITH_AARCH64
Expand Down
1 change: 1 addition & 0 deletions src/CodeGen_C.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ class TypeInfoGatherer : public IRGraphVisitor {

CodeGen_C::CodeGen_C(ostream &s, const Target &t, OutputKind output_kind, const std::string &guard)
: IRPrinter(s), id("$$ BAD ID $$"), target(t), output_kind(output_kind) {
ZoneScoped;

if (output_kind == CPlusPlusFunctionInfoHeader) {
// If it's a header, emit an include guard.
Expand Down
2 changes: 1 addition & 1 deletion src/CodeGen_Hexagon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2295,7 +2295,7 @@ void CodeGen_Hexagon::visit(const Allocate *alloc) {
} // namespace

std::unique_ptr<CodeGen_Posix> new_CodeGen_Hexagon(const Target &target) {
return std::make_unique<CodeGen_Hexagon>(target);
return std::make_unique<Profiled<CodeGen_Hexagon>>(target);
}

#else // WITH_HEXAGON
Expand Down
Loading
Loading