From 069dfacab2c7067c213898e790ae771c93474cc8 Mon Sep 17 00:00:00 2001 From: Hideto Ueno Date: Sun, 12 Apr 2026 22:44:54 -0700 Subject: [PATCH 1/3] [Synth][CutRewriter] Add timing-preserving area-flow reselection --- .../Dialect/Synth/Transforms/CutRewriter.h | 79 ++++- lib/Dialect/Synth/Transforms/CutRewriter.cpp | 308 +++++++++++++++--- test/Dialect/Synth/lut-mapper.mlir | 19 +- test/Dialect/Synth/tech-mapper.mlir | 23 ++ 4 files changed, 384 insertions(+), 45 deletions(-) diff --git a/include/circt/Dialect/Synth/Transforms/CutRewriter.h b/include/circt/Dialect/Synth/Transforms/CutRewriter.h index d613bf58fafd..1c4b5ee447a8 100644 --- a/include/circt/Dialect/Synth/Transforms/CutRewriter.h +++ b/include/circt/Dialect/Synth/Transforms/CutRewriter.h @@ -28,6 +28,7 @@ #include "llvm/Support/Allocator.h" #include "llvm/Support/LogicalResult.h" #include "llvm/Support/raw_ostream.h" +#include #include #include #include @@ -131,6 +132,12 @@ struct LogicNetworkGate { /// inversion bit is encoded in each edge. Signal edges[3]; + /// Number of uses by logic gates in this network. + unsigned logicFanoutCount = 0; + + /// Number of uses outside the logic network. + unsigned externalUseCount = 0; + LogicNetworkGate() : opAndKind(nullptr, Constant), edges{} {} LogicNetworkGate(Operation *op, Kind kind, llvm::ArrayRef operands = {}) @@ -171,11 +178,18 @@ struct LogicNetworkGate { return k == And2 || k == Xor2 || k == Maj3 || k == Identity || k == Choice; } - /// Check if this should always be a cut input (PI or constant). - bool isAlwaysCutInput() const { + /// Check if this gate is a cut leaf (PI or constant). + bool isCutLeaf() const { Kind k = getKind(); return k == PrimaryInput || k == Constant; } + + unsigned getTotalRefCount() const { + unsigned refCount = logicFanoutCount + externalUseCount; + return refCount == 0 ? 1 : refCount; + } + + bool isPrimaryOutput() const { return externalUseCount != 0; } }; /// Flat logic network representation for efficient cut enumeration. 
@@ -258,6 +272,16 @@ class LogicNetwork { /// Get the total number of nodes in the network. size_t size() const { return gates.size(); } + /// Get the total reference count used by area-flow estimation. + unsigned getTotalRefCount(uint32_t index) const { + return gates[index].getTotalRefCount(); + } + + /// Check if a node is observed outside the logic network. + bool isPrimaryOutput(uint32_t index) const { + return gates[index].isPrimaryOutput(); + } + /// Add a primary input to the network. uint32_t addPrimaryInput(Value value); @@ -279,6 +303,9 @@ class LogicNetwork { void clear(); private: + void recordLogicUse(uint32_t index) { ++gates[index].logicFanoutCount; } + void recordExternalUse(uint32_t index) { ++gates[index].externalUseCount; } + /// Map from MLIR Value to network index. llvm::DenseMap valueToIndex; @@ -349,8 +376,10 @@ class MatchedPattern { private: const CutRewritePattern *pattern = nullptr; ///< The matched library pattern SmallVector - arrivalTimes; ///< Arrival times of outputs from this pattern - double area = 0.0; ///< Area cost of this pattern + arrivalTimes; ///< Arrival times of outputs from this pattern + /// Saved match data we reuse during area-flow reselection. + MatchResult matchResult; + SmallVector patternInputToCutInput; public: /// Default constructor creates an invalid matched pattern. @@ -358,18 +387,38 @@ class MatchedPattern { /// Constructor for a valid matched pattern. MatchedPattern(const CutRewritePattern *pattern, - SmallVector arrivalTimes, double area) - : pattern(pattern), arrivalTimes(std::move(arrivalTimes)), area(area) {} + SmallVector arrivalTimes, + MatchResult matchResult, + ArrayRef patternInputToCutInput) + : pattern(pattern), arrivalTimes(std::move(arrivalTimes)), + matchResult(std::move(matchResult)), + patternInputToCutInput(patternInputToCutInput.begin(), + patternInputToCutInput.end()) {} /// Get the arrival time of signals through this pattern. 
DelayType getArrivalTime(unsigned outputIndex) const; ArrayRef getArrivalTimes() const; + DelayType getWorstOutputArrivalTime() const; /// Get the library pattern that was matched. const CutRewritePattern *getPattern() const; /// Get the area cost of using this pattern. double getArea() const; + + /// Get the per-input delays used when scoring this match. + ArrayRef getDelays() const; + + /// Get the cached match payload used to rebuild this match. + const MatchResult &getMatchResult() const { return matchResult; } + + /// Get the mapping from pattern input indices to cut input indices. + ArrayRef getInputPermutation() const { + return patternInputToCutInput; + } + + /// Get the delay for a cut input after accounting for input permutation. + DelayType getDelayForCutInput(unsigned cutInputIndex) const; }; /// Represents a cut in the combinational logic network. @@ -529,6 +578,15 @@ class CutSet { bool isFrozen = false; ///< Whether cut set is finalized public: + /// Latest time this node is allowed to arrive. + DelayType requiredTime = std::numeric_limits::max(); + + /// Arrival time of the currently selected cut. + DelayType bestArrivalTime = 0; + + /// Current area-flow score for the selected cut. + double areaFlow = 0.0; + /// Check if this cut set has a valid matched pattern. bool isMatched() const { return bestCut; } @@ -551,6 +609,9 @@ class CutSet { /// Get read-only access to all cuts in this set. ArrayRef getCuts() const; + + /// Replace the currently selected cut during area recovery. + void setBestCut(Cut *cut) { bestCut = cut; } }; /// Configuration options for the cut-based rewriting algorithm. @@ -658,6 +719,12 @@ class CutEnumerator { void dump() const; + /// Compute required times from the current timing-feasible seed mapping. + void computeRequiredTimes(); + + /// Re-select cuts using area-flow while preserving required times. + void reselectCutsForAreaFlow(); + /// Get cut sets (indexed by LogicNetwork index). 
const llvm::DenseMap &getCutSets() const { return cutSets; diff --git a/lib/Dialect/Synth/Transforms/CutRewriter.cpp b/lib/Dialect/Synth/Transforms/CutRewriter.cpp index 4bc525f5250f..0bc5e22a1281 100644 --- a/lib/Dialect/Synth/Transforms/CutRewriter.cpp +++ b/lib/Dialect/Synth/Transforms/CutRewriter.cpp @@ -26,6 +26,7 @@ #include "circt/Dialect/HW/HWOps.h" #include "circt/Dialect/Synth/SynthOpInterfaces.h" #include "circt/Dialect/Synth/SynthOps.h" +#include "circt/Dialect/Synth/Transforms/SynthPasses.h" #include "circt/Support/LLVM.h" #include "circt/Support/TruthTable.h" #include "circt/Support/UnusedOpPruner.h" @@ -51,6 +52,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LogicalResult.h" #include +#include #include #include #include @@ -221,6 +223,30 @@ LogicalResult LogicNetwork::buildFromBlock(Block *block) { return result; } + auto isInternalLogicUser = [&](Operation *user) { + if (user->getNumResults() != 1) + return false; + Value result = user->getResult(0); + if (!hasIndex(result)) + return false; + return getGate(getIndex(result)).isLogicGate(); + }; + + // Note: Iteration over DenseMap is safe here since the order doesn't affect + // the results. + for (auto &[value, index] : valueToIndex) { + if (index == kConstant0 || index == kConstant1) + continue; + + // Record both internal fanout and external observation for area-flow. + for (OpOperand &use : value.getUses()) { + if (isInternalLogicUser(use.getOwner())) + recordLogicUse(index); + else + recordExternalUse(index); + } + } + return success(); } @@ -240,10 +266,19 @@ void LogicNetwork::clear() { // Helper functions //===----------------------------------------------------------------------===// -// Return true if the gate at the given index is always a cut input. -static bool isAlwaysCutInput(const LogicNetwork &network, uint32_t index) { +// Return true if the gate at the given index must remain a cut leaf. 
+static bool isCutLeaf(const LogicNetwork &network, uint32_t index) { const auto &gate = network.getGate(index); - return gate.isAlwaysCutInput(); + return gate.isCutLeaf(); +} + +// Return this node's cut set, unless it is a leaf or we never built one. +static CutSet *getNonLeafCutSet(llvm::DenseMap &cutSets, + const LogicNetwork &network, uint32_t index) { + if (isCutLeaf(network, index)) + return nullptr; + auto it = cutSets.find(index); + return it == cutSets.end() ? nullptr : it->second; } // Return true if the new area/delay is better than the old area/delay in the @@ -258,6 +293,34 @@ static bool compareDelayAndArea(OptimizationStrategy strategy, double newArea, llvm_unreachable("Unknown mapping strategy"); } +static constexpr double kAreaComparisonEpsilon = 1e-9; + +static bool areEquivalent(double lhs, double rhs) { + return std::abs(lhs - rhs) < kAreaComparisonEpsilon; +} + +static SmallVector +computeOutputArrivalTimes(unsigned numOutputs, unsigned numInputs, + ArrayRef delays, + ArrayRef inputArrivalTimes, + ArrayRef inputPermutation = {}) { + assert(inputPermutation.empty() || inputPermutation.size() == numInputs); + SmallVector outputArrivalTimes; + outputArrivalTimes.reserve(numOutputs); + for (unsigned outputIndex = 0; outputIndex < numOutputs; ++outputIndex) { + DelayType outputArrivalTime = 0; + for (unsigned inputIndex = 0; inputIndex < numInputs; ++inputIndex) { + unsigned cutOriginalInput = + inputPermutation.empty() ? inputIndex : inputPermutation[inputIndex]; + outputArrivalTime = std::max( + outputArrivalTime, delays[outputIndex * numInputs + inputIndex] + + inputArrivalTimes[cutOriginalInput]); + } + outputArrivalTimes.push_back(outputArrivalTime); + } + return outputArrivalTimes; +} + LogicalResult circt::synth::topologicallySortLogicNetwork(Operation *topOp) { const auto isOperationReady = [](Value value, Operation *op) -> bool { // Topologically sort AIG ops and dataflow ops. 
Other operations @@ -381,7 +444,7 @@ Cut::getInputArrivalTimes(CutEnumerator &enumerator, // Compute arrival times for each input. for (auto inputIndex : inputs) { - if (isAlwaysCutInput(network, inputIndex)) { + if (isCutLeaf(network, inputIndex)) { // If the input is a primary input, it has no delay. results.push_back(0); continue; @@ -648,6 +711,13 @@ ArrayRef MatchedPattern::getArrivalTimes() const { return arrivalTimes; } +DelayType MatchedPattern::getWorstOutputArrivalTime() const { + assert(pattern && "Pattern must be set to get arrival time"); + return arrivalTimes.empty() + ? 0 + : *std::max_element(arrivalTimes.begin(), arrivalTimes.end()); +} + DelayType MatchedPattern::getArrivalTime(unsigned index) const { assert(pattern && "Pattern must be set to get arrival time"); return arrivalTimes[index]; @@ -660,7 +730,22 @@ const CutRewritePattern *MatchedPattern::getPattern() const { double MatchedPattern::getArea() const { assert(pattern && "Pattern must be set to get area"); - return area; + return matchResult.area; +} + +ArrayRef MatchedPattern::getDelays() const { + assert(pattern && "Pattern must be set to get delays"); + return matchResult.getDelays(); +} + +DelayType MatchedPattern::getDelayForCutInput(unsigned cutInputIndex) const { + assert(pattern && "Pattern must be set to get delays"); + for (auto [patternInputIndex, mappedCutInput] : + llvm::enumerate(patternInputToCutInput)) { + if (mappedCutInput == cutInputIndex) + return getDelays()[patternInputIndex]; + } + llvm_unreachable("cut input not found in matched permutation"); } //===----------------------------------------------------------------------===// @@ -782,7 +867,8 @@ void CutSet::finalize( std::stable_partition(cuts.begin(), cuts.end(), [](const Cut *cut) { return cut->isTrivialCut(); }); - auto isBetterCut = [&options](const Cut *a, const Cut *b) { + auto isBetterCut = [seedStrategy = options.strategy](const Cut *a, + const Cut *b) { assert(!a->isTrivialCut() && !b->isTrivialCut() && 
"Trivial cuts should have been excluded"); const auto &aMatched = a->getMatchedPattern(); @@ -790,7 +876,7 @@ void CutSet::finalize( if (aMatched && bMatched) return compareDelayAndArea( - options.strategy, aMatched->getArea(), aMatched->getArrivalTimes(), + seedStrategy, aMatched->getArea(), aMatched->getArrivalTimes(), bMatched->getArea(), bMatched->getArrivalTimes()); if (static_cast(aMatched) != static_cast(bMatched)) @@ -811,6 +897,8 @@ void CutSet::finalize( if (!currentMatch) continue; bestCut = cut; + bestArrivalTime = currentMatch->getWorstOutputArrivalTime(); + areaFlow = currentMatch->getArea(); break; } @@ -1224,6 +1312,152 @@ void CutEnumerator::dump() const { llvm::outs() << "Cut enumeration completed successfully\n"; } +void CutEnumerator::computeRequiredTimes() { + DelayType globalWorstArrival = 0; + SmallVector outputCutSets; + for (auto &[index, cutSet] : cutSets) { + if (!logicNetwork.isPrimaryOutput(index)) + continue; + + auto *bestCut = cutSet->getBestMatchedCut(); + if (!bestCut) + continue; + + globalWorstArrival = + std::max(globalWorstArrival, + bestCut->getMatchedPattern()->getWorstOutputArrivalTime()); + outputCutSets.push_back(cutSet); + } + + // There is no output. + if (outputCutSets.empty()) + return; + + // Seed outputs with the worst arrival from the current timing-feasible map. 
+ for (auto *cutSet : outputCutSets) + cutSet->requiredTime = globalWorstArrival; + + for (auto it = processingOrder.rbegin(); it != processingOrder.rend(); ++it) { + auto cutSetIt = cutSets.find(*it); + if (cutSetIt == cutSets.end()) + continue; + + auto *cutSet = cutSetIt->second; + auto *bestCut = cutSet->getBestMatchedCut(); + if (!bestCut) + continue; + + for (auto [i, inputNodeIndex] : llvm::enumerate(bestCut->inputs)) { + auto *inputCutSet = + getNonLeafCutSet(cutSets, logicNetwork, inputNodeIndex); + if (!inputCutSet) + continue; + + DelayType inputRequired = + cutSet->requiredTime - + bestCut->getMatchedPattern()->getDelayForCutInput(i); + inputCutSet->requiredTime = + std::min(inputCutSet->requiredTime, inputRequired); + } + } +} + +// Pick cuts again using area-flow, while staying within the timing bound set +// by the current mapping. +void CutEnumerator::reselectCutsForAreaFlow() { + // Start from the arrival times of the cuts we already selected. + for (auto index : processingOrder) { + auto it = cutSets.find(index); + if (it == cutSets.end()) + continue; + + auto *bestCut = it->second->getBestMatchedCut(); + if (!bestCut) + continue; + + it->second->bestArrivalTime = + bestCut->getMatchedPattern()->getWorstOutputArrivalTime(); + } + + for (auto index : processingOrder) { + auto cutSetIt = cutSets.find(index); + if (cutSetIt == cutSets.end()) + continue; + + auto *cutSet = cutSetIt->second; + Cut *bestAreaFlowCut = nullptr; + std::optional bestAreaFlowMatch; + double bestFlow = std::numeric_limits::max(); + DelayType bestFlowArrival = std::numeric_limits::max(); + double bestLocalArea = std::numeric_limits::max(); + + for (Cut *cut : cutSet->getCuts()) { + // Only cuts we already know how to implement can be reconsidered here. 
+ const auto &candidateMatch = cut->getMatchedPattern(); + if (!candidateMatch) + continue; + + SmallVector inputArrivalTimes; + inputArrivalTimes.reserve(cut->getInputSize()); + for (uint32_t inputIndex : cut->inputs) { + DelayType inputArrival = 0; + if (auto *inputCutSet = + getNonLeafCutSet(cutSets, logicNetwork, inputIndex)) + inputArrival = inputCutSet->bestArrivalTime; + inputArrivalTimes.push_back(inputArrival); + } + + // Recompute this cut's timing from the fanins we currently picked. + auto outputArrivalTimes = computeOutputArrivalTimes( + cut->getOutputSize(logicNetwork), cut->getInputSize(), + candidateMatch->getDelays(), inputArrivalTimes, + candidateMatch->getInputPermutation()); + DelayType arrivalTime = *std::max_element(outputArrivalTimes.begin(), + outputArrivalTimes.end()); + // Do not spend area if it would break the current timing bound. + if (arrivalTime > cutSet->requiredTime) + continue; + + // Count this cut's own area plus its share of the fanins it depends on. + double flow = candidateMatch->getArea(); + for (uint32_t inputIndex : cut->inputs) { + auto *inputCutSet = getNonLeafCutSet(cutSets, logicNetwork, inputIndex); + if (!inputCutSet) + continue; + + flow += + inputCutSet->areaFlow / logicNetwork.getTotalRefCount(inputIndex); + } + + // Break ties in a stable way: lower flow, then earlier timing, then + // lower local area. 
+ if (flow < bestFlow || + (areEquivalent(flow, bestFlow) && arrivalTime < bestFlowArrival) || + (areEquivalent(flow, bestFlow) && arrivalTime == bestFlowArrival && + candidateMatch->getArea() < bestLocalArea)) { + bestFlow = flow; + bestFlowArrival = arrivalTime; + bestLocalArea = candidateMatch->getArea(); + bestAreaFlowCut = cut; + bestAreaFlowMatch = MatchedPattern( + candidateMatch->getPattern(), std::move(outputArrivalTimes), + candidateMatch->getMatchResult(), + candidateMatch->getInputPermutation()); + } + } + + if (!bestAreaFlowCut || !bestAreaFlowMatch) + continue; + + // Later nodes should see the timing and flow of the cut we picked here. + bestAreaFlowCut->setMatchedPattern(std::move(*bestAreaFlowMatch)); + cutSet->setBestCut(bestAreaFlowCut); + cutSet->areaFlow = bestFlow; + cutSet->bestArrivalTime = + bestAreaFlowCut->getMatchedPattern()->getWorstOutputArrivalTime(); + } +} + //===----------------------------------------------------------------------===// // CutRewriter //===----------------------------------------------------------------------===// @@ -1265,6 +1499,13 @@ LogicalResult CutRewriter::run(Operation *topOp) { return success(); } + // Run area-flow based reselection. + // TODO: This selection must be controlled by the strategy option, but + // currently it runs area recovery unconditionally since it improves area + // regardless of the strategy. 
+ cutEnumerator.computeRequiredTimes(); + cutEnumerator.reselectCutsForAreaFlow(); + // Select best cuts and perform mapping if (failed(runBottomUpRewrite(topOp))) return failure(); @@ -1303,41 +1544,29 @@ std::optional CutRewriter::patternMatchCut(const Cut &cut) { const CutRewritePattern *bestPattern = nullptr; SmallVector inputArrivalTimes; SmallVector bestArrivalTimes; - double bestArea = 0.0; + SmallVector bestInputPermutation; + std::optional bestMatchResult; inputArrivalTimes.reserve(cut.getInputSize()); bestArrivalTimes.reserve(cut.getOutputSize(network)); + SmallVector identityMapping(cut.getInputSize()); + for (auto [idx, mapped] : llvm::enumerate(identityMapping)) + mapped = idx; // Compute arrival times for each input. if (failed(cut.getInputArrivalTimes(cutEnumerator, inputArrivalTimes))) return {}; auto computeArrivalTimeAndPickBest = - [&](const CutRewritePattern *pattern, const MatchResult &matchResult, - llvm::function_ref mapIndex) { - SmallVector outputArrivalTimes; - // Compute the maximum delay for each output from inputs. - for (unsigned outputIndex = 0, outputSize = cut.getOutputSize(network); - outputIndex < outputSize; ++outputIndex) { - // Compute the arrival time for this output. 
- DelayType outputArrivalTime = 0; - auto delays = matchResult.getDelays(); - for (unsigned inputIndex = 0, inputSize = cut.getInputSize(); - inputIndex < inputSize; ++inputIndex) { - // Map pattern input i to cut input through NPN transformations - unsigned cutOriginalInput = mapIndex(inputIndex); - outputArrivalTime = - std::max(outputArrivalTime, - delays[outputIndex * inputSize + inputIndex] + - inputArrivalTimes[cutOriginalInput]); - } - - outputArrivalTimes.push_back(outputArrivalTime); - } + [&](const CutRewritePattern *pattern, MatchResult matchResult, + ArrayRef patternInputToCutInput) { + auto outputArrivalTimes = computeOutputArrivalTimes( + cut.getOutputSize(network), cut.getInputSize(), + matchResult.getDelays(), inputArrivalTimes, patternInputToCutInput); // Update the arrival time if (!bestPattern || compareDelayAndArea(options.strategy, matchResult.area, - outputArrivalTimes, bestArea, + outputArrivalTimes, bestMatchResult->area, bestArrivalTimes)) { LLVM_DEBUG({ llvm::dbgs() << "== Matched Pattern ==============\n"; @@ -1359,9 +1588,11 @@ std::optional CutRewriter::patternMatchCut(const Cut &cut) { llvm::dbgs() << "== Matched Pattern End ==============\n"; }); - bestArrivalTimes = std::move(outputArrivalTimes); - bestArea = matchResult.area; + bestArrivalTimes = outputArrivalTimes; + bestInputPermutation.assign(patternInputToCutInput.begin(), + patternInputToCutInput.end()); bestPattern = pattern; + bestMatchResult = std::move(matchResult); } }; @@ -1376,20 +1607,21 @@ std::optional CutRewriter::patternMatchCut(const Cut &cut) { // Get the input mapping from pattern's NPN class to cut's NPN class SmallVector inputMapping; cutNPN.getInputPermutation(patternNPN, inputMapping); - computeArrivalTimeAndPickBest(pattern, *matchResult, - [&](unsigned i) { return inputMapping[i]; }); + computeArrivalTimeAndPickBest(pattern, std::move(*matchResult), + inputMapping); } for (const CutRewritePattern *pattern : patterns.nonNPNPatterns) { if (auto matchResult = 
pattern->match(cutEnumerator, cut)) - computeArrivalTimeAndPickBest(pattern, *matchResult, - [&](unsigned i) { return i; }); + computeArrivalTimeAndPickBest(pattern, std::move(*matchResult), + identityMapping); } if (!bestPattern) return {}; // No matching pattern found - return MatchedPattern(bestPattern, std::move(bestArrivalTimes), bestArea); + return MatchedPattern(bestPattern, std::move(bestArrivalTimes), + std::move(*bestMatchResult), bestInputPermutation); } LogicalResult CutRewriter::runBottomUpRewrite(Operation *top) { @@ -1417,7 +1649,7 @@ LogicalResult CutRewriter::runBottomUpRewrite(Operation *top) { continue; } - if (isAlwaysCutInput(network, index)) { + if (isCutLeaf(network, index)) { // If the value is a primary input, skip it LLVM_DEBUG(llvm::dbgs() << "Skipping inputs: " << value << "\n"); continue; diff --git a/test/Dialect/Synth/lut-mapper.mlir b/test/Dialect/Synth/lut-mapper.mlir index 9475a3e3137e..ffa963dee22f 100644 --- a/test/Dialect/Synth/lut-mapper.mlir +++ b/test/Dialect/Synth/lut-mapper.mlir @@ -1,6 +1,7 @@ // FIXME: max-cuts-per-root=20 is due to a lack of non-minimal cut filtering. 
// RUN: circt-opt --pass-pipeline='builtin.module(hw.module(synth-generic-lut-mapper{test=true max-cuts-per-root=20}))' %s | FileCheck %s --check-prefixes CHECK,LUT -// RUN: circt-opt --pass-pipeline='builtin.module(hw.module(synth-generic-lut-mapper{test=true max-lut-size=2}))' %s | FileCheck %s --check-prefixes CHECK,LUT2 +// RUN: circt-opt --pass-pipeline='builtin.module(hw.module(synth-generic-lut-mapper{test=true max-lut-size=2 strategy=timing}))' %s | FileCheck %s --check-prefixes CHECK,LUT2 +// RUN: circt-opt --pass-pipeline='builtin.module(hw.module(synth-generic-lut-mapper{test=true max-lut-size=2 strategy=area}))' %s | FileCheck %s --check-prefixes CHECK,LUT2 // CHECK: %[[B_0:.+]] = comb.extract %b from 0 : (i2) -> i1 // CHECK-NEXT: %[[B_1:.+]] = comb.extract %b from 1 : (i2) -> i1 @@ -68,3 +69,19 @@ hw.module @choice_slow_branch(in %a : i1, in %b : i1, in %c : i1, %choice = synth.choice %slow, %fast : i1 hw.output %choice : i1 } + +// CHECK-LABEL: hw.module @choice_shared_area_flow +// LUT2-NEXT: %[[AB:.+]] = comb.truth_table %b, %a -> [false, false, false, true] +// LUT2-SAME: test.arrival_times = [1] +// LUT2-NEXT: %[[OUT:.+]] = comb.truth_table %[[AB]], %c -> [false, false, false, true] +// LUT2-SAME: test.arrival_times = [2] +// LUT2-NEXT: hw.output %[[AB]], %[[OUT]] : i1, i1 +hw.module @choice_shared_area_flow(in %a : i1, in %b : i1, in %c : i1, + out share : i1, out y : i1) { + %ab = synth.aig.and_inv %a, %b : i1 + %left = synth.aig.and_inv %ab, %c : i1 + %bc = synth.aig.and_inv %b, %c : i1 + %right = synth.aig.and_inv %a, %bc : i1 + %choice = synth.choice %right, %left : i1 + hw.output %ab, %choice : i1, i1 +} diff --git a/test/Dialect/Synth/tech-mapper.mlir b/test/Dialect/Synth/tech-mapper.mlir index 073423c43c94..e293419131b3 100644 --- a/test/Dialect/Synth/tech-mapper.mlir +++ b/test/Dialect/Synth/tech-mapper.mlir @@ -96,6 +96,29 @@ hw.module @area_flow_test(in %a : i1, in %b : i1, in %c: i1, out result : i1) { hw.output %1 : i1 } +// One 
output arrives later and gives the other output enough slack to switch +// to a cheaper implementation during recovery. +// CHECK-LABEL: @shared_slack_allows_area_recovery +hw.module @shared_slack_allows_area_recovery(in %a : i1, in %b : i1, + in %c : i1, in %d : i1, + in %e : i1, + out slow : i1, out cheap : i1) { + // The first output is deep enough to arrive at 3, so recovery should switch the second output to the cheaper 2-stage implementation. + // Without recovery @and_inv_3 would be used for the second output. + // TIMING: {test.arrival_times = [3]} + // TIMING-NEXT: %[[TCHEAP0:.+]] = hw.instance "{{[a-zA-Z0-9_]+}}" @and_inv(a: %a: i1, b: %b: i1) -> (result: i1) {test.arrival_times = [1]} + // TIMING-NEXT: %[[TCHEAP1:.+]] = hw.instance "{{[a-zA-Z0-9_]+}}" @and_inv_n(a: %[[TCHEAP0]]: i1, b: %c: i1) -> (result: i1) {test.arrival_times = [2]} + // TIMING-NEXT: hw.output %{{.+}}, %[[TCHEAP1]] : i1, i1 + %slow0 = synth.aig.and_inv %a, %b : i1 + %slow1 = synth.aig.and_inv %c, not %slow0 : i1 + %slow2 = synth.aig.and_inv %d, not %slow1 : i1 + %slow3 = synth.aig.and_inv %e, not %slow2 : i1 + + %cheap0 = synth.aig.and_inv %a, %b : i1 + %cheap1 = synth.aig.and_inv %c, not %cheap0 : i1 + hw.output %slow3, %cheap1 : i1, i1 +} + // Test primary inputs handling // CHECK-LABEL: @primary_inputs_test hw.module @primary_inputs_test(in %a : i1, in %b : i1, out result : i1) { From fcf1d2beb6ec65c3941d5ed698b76c496bfebe7c Mon Sep 17 00:00:00 2001 From: Hideto Ueno Date: Thu, 16 Apr 2026 21:35:59 -0700 Subject: [PATCH 2/3] [Synth][CutRewriter] Use selected-cover fanout for area-flow --- .../Dialect/Synth/Transforms/CutRewriter.h | 21 ++++------ lib/Dialect/Synth/Transforms/CutRewriter.cpp | 39 ++++++++++++++++--- .../Synth/tech-mapper-area-flow-fanout.mlir | 39 +++++++++++++++++++ 3 files changed, 79 insertions(+), 20 deletions(-) create mode 100644 test/Dialect/Synth/tech-mapper-area-flow-fanout.mlir diff --git a/include/circt/Dialect/Synth/Transforms/CutRewriter.h 
b/include/circt/Dialect/Synth/Transforms/CutRewriter.h index 1c4b5ee447a8..c45f25d35d7f 100644 --- a/include/circt/Dialect/Synth/Transforms/CutRewriter.h +++ b/include/circt/Dialect/Synth/Transforms/CutRewriter.h @@ -132,9 +132,6 @@ struct LogicNetworkGate { /// inversion bit is encoded in each edge. Signal edges[3]; - /// Number of uses by logic gates in this network. - unsigned logicFanoutCount = 0; - /// Number of uses outside the logic network. unsigned externalUseCount = 0; @@ -184,12 +181,9 @@ struct LogicNetworkGate { return k == PrimaryInput || k == Constant; } - unsigned getTotalRefCount() const { - unsigned refCount = logicFanoutCount + externalUseCount; - return refCount == 0 ? 1 : refCount; - } - bool isPrimaryOutput() const { return externalUseCount != 0; } + + unsigned getExternalUseCount() const { return externalUseCount; } }; /// Flat logic network representation for efficient cut enumeration. @@ -272,16 +266,16 @@ class LogicNetwork { /// Get the total number of nodes in the network. size_t size() const { return gates.size(); } - /// Get the total reference count used by area-flow estimation. - unsigned getTotalRefCount(uint32_t index) const { - return gates[index].getTotalRefCount(); - } - /// Check if a node is observed outside the logic network. bool isPrimaryOutput(uint32_t index) const { return gates[index].isPrimaryOutput(); } + /// Get the number of uses outside the logic network. + unsigned getExternalUseCount(uint32_t index) const { + return gates[index].getExternalUseCount(); + } + /// Add a primary input to the network. uint32_t addPrimaryInput(Value value); @@ -303,7 +297,6 @@ class LogicNetwork { void clear(); private: - void recordLogicUse(uint32_t index) { ++gates[index].logicFanoutCount; } void recordExternalUse(uint32_t index) { ++gates[index].externalUseCount; } /// Map from MLIR Value to network index. 
diff --git a/lib/Dialect/Synth/Transforms/CutRewriter.cpp b/lib/Dialect/Synth/Transforms/CutRewriter.cpp index 0bc5e22a1281..aa6921c20bf6 100644 --- a/lib/Dialect/Synth/Transforms/CutRewriter.cpp +++ b/lib/Dialect/Synth/Transforms/CutRewriter.cpp @@ -238,11 +238,9 @@ LogicalResult LogicNetwork::buildFromBlock(Block *block) { if (index == kConstant0 || index == kConstant1) continue; - // Record both internal fanout and external observation for area-flow. + // Record observations that remain visible after cut covering. for (OpOperand &use : value.getUses()) { - if (isInternalLogicUser(use.getOwner())) - recordLogicUse(index); - else + if (!isInternalLogicUser(use.getOwner())) recordExternalUse(index); } } @@ -1365,6 +1363,27 @@ void CutEnumerator::computeRequiredTimes() { // Pick cuts again using area-flow, while staying within the timing bound set // by the current mapping. void CutEnumerator::reselectCutsForAreaFlow() { + SmallVector selectedRefCounts(logicNetwork.size(), 0); + for (auto [index, gate] : llvm::enumerate(logicNetwork.getGates())) + selectedRefCounts[index] = gate.getExternalUseCount(); + + auto addSelectedCutRefs = [&](const Cut *cut) { + if (!cut) + return; + for (uint32_t inputIndex : cut->inputs) + ++selectedRefCounts[inputIndex]; + }; + + auto dropSelectedCutRefs = [&](const Cut *cut) { + if (!cut) + return; + for (uint32_t inputIndex : cut->inputs) { + assert(selectedRefCounts[inputIndex] != 0 && + "selected reference count underflow"); + --selectedRefCounts[inputIndex]; + } + }; + // Start from the arrival times of the cuts we already selected. 
for (auto index : processingOrder) { auto it = cutSets.find(index); @@ -1377,6 +1396,7 @@ void CutEnumerator::reselectCutsForAreaFlow() { it->second->bestArrivalTime = bestCut->getMatchedPattern()->getWorstOutputArrivalTime(); + addSelectedCutRefs(bestCut); } for (auto index : processingOrder) { @@ -1385,6 +1405,7 @@ void CutEnumerator::reselectCutsForAreaFlow() { continue; auto *cutSet = cutSetIt->second; + Cut *currentBestCut = cutSet->getBestMatchedCut(); Cut *bestAreaFlowCut = nullptr; std::optional bestAreaFlowMatch; double bestFlow = std::numeric_limits::max(); @@ -1425,8 +1446,12 @@ void CutEnumerator::reselectCutsForAreaFlow() { if (!inputCutSet) continue; - flow += - inputCutSet->areaFlow / logicNetwork.getTotalRefCount(inputIndex); + unsigned effectiveRefCount = selectedRefCounts[inputIndex]; + if (!currentBestCut || + !llvm::is_contained(currentBestCut->inputs, inputIndex)) + ++effectiveRefCount; + assert(effectiveRefCount != 0 && "cut inputs must be referenced"); + flow += inputCutSet->areaFlow / effectiveRefCount; } // Break ties in a stable way: lower flow, then earlier timing, then @@ -1450,6 +1475,8 @@ void CutEnumerator::reselectCutsForAreaFlow() { continue; // Later nodes should see the timing and flow of the cut we picked here. 
+ dropSelectedCutRefs(currentBestCut); + addSelectedCutRefs(bestAreaFlowCut); bestAreaFlowCut->setMatchedPattern(std::move(*bestAreaFlowMatch)); cutSet->setBestCut(bestAreaFlowCut); cutSet->areaFlow = bestFlow; diff --git a/test/Dialect/Synth/tech-mapper-area-flow-fanout.mlir b/test/Dialect/Synth/tech-mapper-area-flow-fanout.mlir new file mode 100644 index 000000000000..5d4bd59d3f7d --- /dev/null +++ b/test/Dialect/Synth/tech-mapper-area-flow-fanout.mlir @@ -0,0 +1,39 @@ +// RUN: circt-opt --pass-pipeline='builtin.module(synth-tech-mapper{strategy=area test=true max-cuts-per-root=8})' %s | FileCheck %s --check-prefixes CHECK,AREA + +hw.module @and_inv(in %a : i1, in %b : i1, out result : i1) attributes {hw.techlib.info = {area = 1.0 : f64, delay = [[1], [1]]}} { + %0 = synth.aig.and_inv %a, %b : i1 + hw.output %0 : i1 +} + +hw.module @and_inv_n(in %a : i1, in %b : i1, out result : i1) attributes {hw.techlib.info = {area = 1.0 : f64, delay = [[1], [1]]}} { + %0 = synth.aig.and_inv not %a, %b : i1 + hw.output %0 : i1 +} + +hw.module @and_inv_3_cheap(in %a : i1, in %b : i1, in %c : i1, out result : i1) attributes {hw.techlib.info = {area = 0.75 : f64, delay = [[1], [1], [1]]}} { + %0 = synth.aig.and_inv %a, %b : i1 + %1 = synth.aig.and_inv not %0, %c : i1 + hw.output %1 : i1 +} + +hw.module @and3_mid(in %a : i1, in %b : i1, in %c : i1, out result : i1) attributes {hw.techlib.info = {area = 1.75 : f64, delay = [[1], [1], [1]]}} { + %0 = synth.aig.and_inv %a, %b : i1 + %1 = synth.aig.and_inv %0, %c : i1 + hw.output %1 : i1 +} + +// Make sure area-flow uses the current cut cover's fanout, not the original +// AIG fanout. The first output switches away from %ab immediately, so the +// second output must see %ab with a single mapped reference. 
+// CHECK-LABEL: @mapped_fanout_drives_area_flow +hw.module @mapped_fanout_drives_area_flow(in %a : i1, in %b : i1, in %c : i1, + in %d : i1, + out cheap : i1, out recovered : i1) { + // AREA: %[[CHEAP:.+]] = hw.instance "{{[a-zA-Z0-9_]+}}" @and_inv_3_cheap(a: %a: i1, b: %b: i1, c: %c: i1) -> (result: i1) {test.arrival_times = [1]} + // AREA-NEXT: %[[RECOVERED:.+]] = hw.instance "{{[a-zA-Z0-9_]+}}" @and3_mid(a: %a: i1, b: %b: i1, c: %d: i1) -> (result: i1) {test.arrival_times = [1]} + // AREA-NEXT: hw.output %[[CHEAP]], %[[RECOVERED]] : i1, i1 + %ab = synth.aig.and_inv %a, %b : i1 + %cheap = synth.aig.and_inv %c, not %ab : i1 + %recovered = synth.aig.and_inv %ab, %d : i1 + hw.output %cheap, %recovered : i1, i1 +} From e3f676e0de5583b4dd2ba6060b82cdb5d26b91d1 Mon Sep 17 00:00:00 2001 From: Hideto Ueno Date: Thu, 16 Apr 2026 21:44:39 -0700 Subject: [PATCH 3/3] [Synth][CutRewriter] Add exact-area cut reselection --- .../Dialect/Synth/Transforms/CutRewriter.h | 3 + lib/Dialect/Synth/Transforms/CutRewriter.cpp | 173 ++++++++++++++++++ 2 files changed, 176 insertions(+) diff --git a/include/circt/Dialect/Synth/Transforms/CutRewriter.h b/include/circt/Dialect/Synth/Transforms/CutRewriter.h index c45f25d35d7f..b672e3e7830a 100644 --- a/include/circt/Dialect/Synth/Transforms/CutRewriter.h +++ b/include/circt/Dialect/Synth/Transforms/CutRewriter.h @@ -718,6 +718,9 @@ class CutEnumerator { /// Re-select cuts using area-flow while preserving required times. void reselectCutsForAreaFlow(); + /// Re-select cuts using exact-area deref/ref while preserving required times. + void reselectCutsForExactArea(); + /// Get cut sets (indexed by LogicNetwork index). 
const llvm::DenseMap &getCutSets() const {
     return cutSets;
diff --git a/lib/Dialect/Synth/Transforms/CutRewriter.cpp b/lib/Dialect/Synth/Transforms/CutRewriter.cpp
index aa6921c20bf6..5ca3776ca469 100644
--- a/lib/Dialect/Synth/Transforms/CutRewriter.cpp
+++ b/lib/Dialect/Synth/Transforms/CutRewriter.cpp
@@ -1485,6 +1485,178 @@ void CutEnumerator::reselectCutsForAreaFlow() {
   }
 }
 
+// Pick cuts again using exact-area deref/ref while staying within the timing
+// bound set by the current mapping.
+//
+// `selectedRefCounts[i]` tracks how often node `i` is referenced by the
+// currently selected cover: it starts at the node's external use count and is
+// incremented / decremented as selected cuts reference / release their inputs.
+// Nodes are visited in `processingOrder`. For each node, every matched
+// candidate cut whose worst output arrival time does not exceed the node's
+// `requiredTime` is measured by referencing it and immediately dereferencing
+// it again. The candidate with the smallest measured area is installed as the
+// best cut (ties: earlier arrival, then smaller pattern area).
+void CutEnumerator::reselectCutsForExactArea() {
+  SmallVector selectedRefCounts(logicNetwork.size(), 0);
+  for (auto [index, gate] : llvm::enumerate(logicNetwork.getGates()))
+    selectedRefCounts[index] = gate.getExternalUseCount();
+
+  // Add one reference to node `index` and return the area that becomes newly
+  // used because of it. Nodes for which getNonLeafCutSet() yields nothing
+  // contribute 0. Only the first reference (count 0 -> 1) recurses: it charges
+  // the area of the node's best matched pattern plus, recursively, that cut's
+  // inputs. `self` is this lambda, passed explicitly so that it can recurse.
+  auto referenceNode = [&](auto &&self, uint32_t index) -> double {
+    auto *cutSet = getNonLeafCutSet(cutSets, logicNetwork, index);
+    if (!cutSet)
+      return 0.0;
+
+    if (selectedRefCounts[index]++ > 0)
+      return 0.0;
+
+    auto *bestCut = cutSet->getBestMatchedCut();
+    if (!bestCut)
+      return 0.0;
+
+    double area = bestCut->getMatchedPattern()->getArea();
+    for (uint32_t inputIndex : bestCut->inputs)
+      area += self(self, inputIndex);
+    return area;
+  };
+
+  // Counterpart of referenceNode: drop one reference from node `index` and
+  // return the area that is released. Only when the count reaches zero is the
+  // node's best matched pattern area released and that cut's inputs
+  // dereferenced recursively.
+  auto dereferenceNode = [&](auto &&self, uint32_t index) -> double {
+    auto *cutSet = getNonLeafCutSet(cutSets, logicNetwork, index);
+    if (!cutSet)
+      return 0.0;
+
+    assert(selectedRefCounts[index] != 0 &&
+           "selected reference count underflow");
+    if (--selectedRefCounts[index] > 0)
+      return 0.0;
+
+    auto *bestCut = cutSet->getBestMatchedCut();
+    if (!bestCut)
+      return 0.0;
+
+    double area = bestCut->getMatchedPattern()->getArea();
+    for (uint32_t inputIndex : bestCut->inputs)
+      area += self(self, inputIndex);
+    return area;
+  };
+
+  // Reference every input of `cut` and return the pattern area of `cut` plus
+  // the area newly referenced through its inputs. The count of the cut's own
+  // root is not modified. A null cut yields 0.
+  auto referenceCut = [&](Cut *cut) -> double {
+    if (!cut)
+      return 0.0;
+    double area = cut->getMatchedPattern()->getArea();
+    for (uint32_t inputIndex : cut->inputs)
+      area += referenceNode(referenceNode, inputIndex);
+    return area;
+  };
+
+  // Dereference every input of `cut` and return the pattern area of `cut`
+  // plus the area released through its inputs. A null cut yields 0.
+  auto dereferenceCut = [&](Cut *cut) -> double {
+    if (!cut)
+      return 0.0;
+    double area = cut->getMatchedPattern()->getArea();
+    for (uint32_t inputIndex : cut->inputs)
+      area += dereferenceNode(dereferenceNode, inputIndex);
+    return area;
+  };
+
+  // Seed counts and arrival times from the current mapping.
+  // First pass: for every node with a matched best cut, record its worst
+  // output arrival time and set its count to the external use count.
+  for (auto index : processingOrder) {
+    auto it = cutSets.find(index);
+    if (it == cutSets.end())
+      continue;
+
+    auto *bestCut = it->second->getBestMatchedCut();
+    if (!bestCut)
+      continue;
+
+    it->second->bestArrivalTime =
+        bestCut->getMatchedPattern()->getWorstOutputArrivalTime();
+    selectedRefCounts[index] = logicNetwork.getExternalUseCount(index);
+  }
+  // Second pass: reference the best cut of every node whose count is non-zero
+  // when it is visited; referenceNode recursively pulls in the nodes those
+  // cuts depend on.
+  for (auto index : processingOrder) {
+    if (selectedRefCounts[index] == 0)
+      continue;
+
+    auto cutSetIt = cutSets.find(index);
+    if (cutSetIt == cutSets.end())
+      continue;
+
+    auto *bestCut = cutSetIt->second->getBestMatchedCut();
+    if (!bestCut)
+      continue;
+
+    (void)referenceCut(bestCut);
+  }
+
+  // Reselection pass. Nodes without a cut set or without a matched best cut
+  // are left untouched.
+  for (auto index : processingOrder) {
+    auto cutSetIt = cutSets.find(index);
+    if (cutSetIt == cutSets.end())
+      continue;
+
+    auto *cutSet = cutSetIt->second;
+    Cut *currentBestCut = cutSet->getBestMatchedCut();
+    if (!currentBestCut)
+      continue;
+
+    // A node is active when its count is non-zero. Release the references
+    // held by its current cut so candidates are measured without them.
+    bool isActive = selectedRefCounts[index] != 0;
+    if (isActive)
+      (void)dereferenceCut(currentBestCut);
+
+    Cut *bestExactCut = nullptr;
+    std::optional bestExactMatch;
+    double bestExactArea = std::numeric_limits::max();
+    DelayType bestExactArrival = std::numeric_limits::max();
+    double bestLocalArea = std::numeric_limits::max();
+
+    for (Cut *cut : cutSet->getCuts()) {
+      const auto &candidateMatch = cut->getMatchedPattern();
+      if (!candidateMatch)
+        continue;
+
+      // Input arrival times come from the best arrival time stored on each
+      // input's cut set; inputs without a non-leaf cut set arrive at time 0.
+      SmallVector inputArrivalTimes;
+      inputArrivalTimes.reserve(cut->getInputSize());
+      for (uint32_t inputIndex : cut->inputs) {
+        DelayType inputArrival = 0;
+        if (auto *inputCutSet =
+                getNonLeafCutSet(cutSets, logicNetwork, inputIndex))
+          inputArrival = inputCutSet->bestArrivalTime;
+        inputArrivalTimes.push_back(inputArrival);
+      }
+
+      auto outputArrivalTimes = computeOutputArrivalTimes(
+          cut->getOutputSize(logicNetwork), cut->getInputSize(),
+          candidateMatch->getDelays(), inputArrivalTimes,
+          candidateMatch->getInputPermutation());
+      DelayType arrivalTime = *std::max_element(outputArrivalTimes.begin(),
+                                                outputArrivalTimes.end());
+      // Skip candidates that would violate this node's required time.
+      if (arrivalTime > cutSet->requiredTime)
+        continue;
+
+      // Measure the candidate by referencing it, then immediately undo those
+      // references again.
+      double exactArea = referenceCut(cut);
+      (void)dereferenceCut(cut);
+
+      // Prefer smaller exact area; among equivalent areas prefer the earlier
+      // arrival; among equal arrivals prefer the smaller pattern area.
+      if (exactArea < bestExactArea ||
+          (areEquivalent(exactArea, bestExactArea) &&
+           arrivalTime < bestExactArrival) ||
+          (areEquivalent(exactArea, bestExactArea) &&
+           arrivalTime == bestExactArrival &&
+           candidateMatch->getArea() < bestLocalArea)) {
+        bestExactArea = exactArea;
+        bestExactArrival = arrivalTime;
+        bestLocalArea = candidateMatch->getArea();
+        bestExactCut = cut;
+        bestExactMatch = MatchedPattern(candidateMatch->getPattern(),
+                                        std::move(outputArrivalTimes),
+                                        candidateMatch->getMatchResult(),
+                                        candidateMatch->getInputPermutation());
+      }
+    }
+
+    // No candidate met the required time: keep the current cut and restore
+    // the references released above.
+    if (!bestExactCut || !bestExactMatch) {
+      if (isActive)
+        (void)referenceCut(currentBestCut);
+      continue;
+    }
+
+    // Install the winner together with its recomputed arrival times.
+    bestExactCut->setMatchedPattern(std::move(*bestExactMatch));
+    cutSet->setBestCut(bestExactCut);
+    cutSet->bestArrivalTime =
+        bestExactCut->getMatchedPattern()->getWorstOutputArrivalTime();
+
+    // An active node takes references on the inputs of its new cut.
+    if (isActive)
+      (void)referenceCut(bestExactCut);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // CutRewriter
 //===----------------------------------------------------------------------===//
@@ -1532,6 +1704,7 @@ LogicalResult CutRewriter::run(Operation *topOp) {
   // regardless of the strategy.
   cutEnumerator.computeRequiredTimes();
   cutEnumerator.reselectCutsForAreaFlow();
+  cutEnumerator.reselectCutsForExactArea();
 
   // Select best cuts and perform mapping
   if (failed(runBottomUpRewrite(topOp)))