Skip to content

Commit

Permalink
speeding up caches for hfsort+
Browse files Browse the repository at this point in the history
Summary:
When running hfsort+, we invalidate too many cache entries, which leads to inefficiencies. It seems we only need to invalidate the cache for the pairs of clusters (Into, X) and (X, Into) when modifying cluster Into (for all clusters X).
With the modification, we no longer need the short-call caches (ShortCallCache and ShortCallPairCache), since each value is computed only once per pair of clusters.

(cherry picked from FBD6341039)
  • Loading branch information
spupyrev authored and memfrob committed Oct 4, 2022
1 parent 30cc9b6 commit c588b5e
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 94 deletions.
4 changes: 1 addition & 3 deletions bolt/Passes/HFSort.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,7 @@ std::vector<Cluster> clusterize(const CallGraph &Cg);
/*
* Optimize function placement for iTLB cache and i-cache.
*/
std::vector<Cluster> hfsortPlus(CallGraph &Cg,
bool UseGainCache = true,
bool UseShortCallCache = true);
std::vector<Cluster> hfsortPlus(CallGraph &Cg, bool UseGainCache = true);

/*
* Pettis-Hansen code layout algorithm
Expand Down
102 changes: 20 additions & 82 deletions bolt/Passes/HFSortPlus.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ using namespace llvm;
using namespace bolt;

namespace opts {
extern cl::OptionCategory BoltCategory;

extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> Verbosity;

Expand Down Expand Up @@ -92,17 +92,6 @@ int32_t ITLBPageSize;
// while smaller values result in better i-cache performance
int32_t ITLBEntries;

// Returns a human-readable description of which memoization caches are
// enabled, for use in the debug banner printed when hfsort+ starts.
const char *cacheKindString(bool UseGainCache, bool UseShortCallCache) {
  // Guard-clause style: handle each combination explicitly, starting
  // with the "nothing enabled" case.
  if (!UseGainCache && !UseShortCallCache)
    return "no cache";
  if (!UseShortCallCache)
    return "gain cache";
  if (!UseGainCache)
    return "short call cache";
  return "gain + short call cache";
}

// This class maintains adjacency information for all Clusters being
// processed. It is used to invalidate cache entries when merging
// Clusters and for visiting all neighbors of any given Cluster.
Expand Down Expand Up @@ -215,17 +204,16 @@ class PrecomputedResults {
Valid[Index] = true;
}

void invalidate(const AdjacencyMatrix &Adjacent, const Cluster *C) {
invalidate(C);
Adjacent.forallAdjacent(C, [&](const Cluster *A) { invalidate(A); });
}
private:
// Invalidate every cached entry for a pair involving cluster C.
// With index(First, Second) = First->id() * Size + Second->id(), the
// entries involving C form one contiguous row plus one strided column
// of the Size x Size validity matrix.
void invalidate(const Cluster *C) {
// Row: all pairs (C, X) occupy the contiguous index range
// [C->id() * Size, (C->id() + 1) * Size).
Valid.reset(C->id() * Size, (C->id() + 1) * Size);
// Column: each pair (X, C) sits at offset C->id() within X's row.
for (size_t Id = 0; Id < Size; Id++) {
Valid.reset(Id * Size + C->id());
}
}

private:
size_t index(const Cluster *First, const Cluster *Second) const {
return (First->id() * Size) + Second->id();
return First->id() * Size + Second->id();
}

size_t Size;
Expand Down Expand Up @@ -347,12 +335,6 @@ class HFSortPlus {
* the same cache page
*/
double shortCalls(const Cluster *Cluster) const {
if (UseShortCallCache) {
auto Itr = ShortCallCache.find(Cluster);
if (Itr != ShortCallCache.end())
return Itr->second;
}

double Calls = 0;
for (auto TargetId : Cluster->targets()) {
for (auto Succ : Cg.successors(TargetId)) {
Expand All @@ -367,10 +349,6 @@ class HFSortPlus {
}
}

if (UseShortCallCache) {
ShortCallCache[Cluster] = Calls;
}

return Calls;
}

Expand All @@ -380,11 +358,6 @@ class HFSortPlus {
*/
double shortCalls(const Cluster *ClusterPred,
const Cluster *ClusterSucc) const {
if (UseShortCallCache &&
ShortCallPairCache.contains(ClusterPred, ClusterSucc)) {
return ShortCallPairCache.get(ClusterPred, ClusterSucc);
}

double Calls = 0;
for (auto TargetId : ClusterPred->targets()) {
for (auto Succ : Cg.successors(TargetId)) {
Expand Down Expand Up @@ -413,10 +386,6 @@ class HFSortPlus {
}
}

if (UseShortCallCache) {
ShortCallPairCache.set(ClusterPred, ClusterSucc, Calls);
}

return Calls;
}

Expand All @@ -434,8 +403,8 @@ class HFSortPlus {
*/
double mergeGain(const Cluster *ClusterPred,
const Cluster *ClusterSucc) const {
if (UseGainCache && Cache.contains(ClusterPred, ClusterSucc)) {
return Cache.get(ClusterPred, ClusterSucc);
if (UseGainCache && GainCache.contains(ClusterPred, ClusterSucc)) {
return GainCache.get(ClusterPred, ClusterSucc);
}

// cache misses on the first cluster
Expand All @@ -460,7 +429,7 @@ class HFSortPlus {
Gain /= std::min(ClusterPred->size(), ClusterSucc->size());

if (UseGainCache) {
Cache.set(ClusterPred, ClusterSucc, Gain);
GainCache.set(ClusterPred, ClusterSucc, Gain);
}

return Gain;
Expand Down Expand Up @@ -513,7 +482,7 @@ class HFSortPlus {
const double ProbOut =
CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0;
assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect probability");

// probability that the second cluster is called from the first one
const double ProbIn =
CallsToSucc > 0 ? CallsPredSucc / CallsToSucc : 0;
Expand Down Expand Up @@ -601,13 +570,12 @@ class HFSortPlus {
*/
std::vector<Cluster> run() {
DEBUG(dbgs() << "Starting hfsort+ w/"
<< cacheKindString(UseGainCache, UseShortCallCache)
<< (UseGainCache ? "gain cache" : "no cache")
<< " for " << Clusters.size() << " clusters "
<< "with ITLBPageSize = " << ITLBPageSize << ", "
<< "ITLBEntries = " << ITLBEntries << ", "
<< "and MergeProbability = " << opts::MergeProbability << "\n");


// Pass 1
runPassOne();

Expand All @@ -628,19 +596,15 @@ class HFSortPlus {
return Result;
}

HFSortPlus(const CallGraph &Cg,
bool UseGainCache,
bool UseShortCallCache)
HFSortPlus(const CallGraph &Cg, bool UseGainCache)
: Cg(Cg),
FuncCluster(Cg.numNodes(), nullptr),
Addr(Cg.numNodes(), InvalidAddr),
TotalSamples(0.0),
Clusters(initializeClusters()),
Adjacent(Cg, Clusters, FuncCluster),
UseGainCache(UseGainCache),
UseShortCallCache(UseShortCallCache),
Cache(Clusters.size()),
ShortCallPairCache(Clusters.size()) {
GainCache(Clusters.size()) {
}
private:

Expand Down Expand Up @@ -696,31 +660,16 @@ class HFSortPlus {
CurAddr = ((CurAddr + Align - 1) / Align) * Align;
}

// Update caches
invalidateCaches(Into);
// Invalidate all cache entries associated with cluster Into
if (UseGainCache) {
GainCache.invalidate(Into);
}

// Remove cluster From from the list of active clusters
auto Iter = std::remove(Clusters.begin(), Clusters.end(), From);
Clusters.erase(Iter, Clusters.end());
}

/*
 * Invalidate all cache entries associated with cluster C and its neighbors.
 *
 * Called after C has been modified (e.g. merged with another cluster), so
 * that any memoized shortCalls/mergeGain value involving C or a cluster
 * adjacent to C is recomputed on the next query instead of being served
 * stale.
 */
void invalidateCaches(const Cluster *C) {
if (UseShortCallCache) {
// Drop the single-cluster shortCalls entry for C itself and for every
// cluster adjacent to C.
maybeErase(ShortCallCache, C);
Adjacent.forallAdjacent(C,
[this](const Cluster *A) {
maybeErase(ShortCallCache, A);
});
// Drop the pairwise shortCalls entries involving C and its neighbors.
ShortCallPairCache.invalidate(Adjacent, C);
}
if (UseGainCache) {
// Drop the memoized mergeGain entries involving C and its neighbors.
Cache.invalidate(Adjacent, C);
}
}

// The call graph
const CallGraph &Cg;

Expand All @@ -746,32 +695,21 @@ class HFSortPlus {
// Use cache for mergeGain results
bool UseGainCache;

// Use caches for shortCalls results
bool UseShortCallCache;

// A cache that keeps precomputed values of mergeGain for pairs of clusters;
// when a pair of clusters (x,y) gets merged, we need to invalidate the pairs
// containing both x and y and all clusters adjacent to x and y (and recompute
// them on the next iteration).
mutable PrecomputedResults Cache;

// Cache for shortCalls for a single cluster.
mutable std::unordered_map<const Cluster *, double> ShortCallCache;

// Cache for shortCalls for a pair of Clusters
mutable PrecomputedResults ShortCallPairCache;
mutable PrecomputedResults GainCache;
};

}

std::vector<Cluster> hfsortPlus(CallGraph &Cg,
bool UseGainCache,
bool UseShortCallCache) {
std::vector<Cluster> hfsortPlus(CallGraph &Cg, bool UseGainCache) {
// It is required that the sum of incoming arc weights is not greater
// than the number of samples for every function.
// Ensuring the call graph obeys the property before running the algorithm.
Cg.adjustArcWeights();
return HFSortPlus(Cg, UseGainCache, UseShortCallCache).run();
return HFSortPlus(Cg, UseGainCache).run();
}

}}
10 changes: 1 addition & 9 deletions bolt/Passes/ReorderFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,6 @@ UseGainCache("hfsort+-use-cache",
llvm::cl::Hidden,
llvm::cl::cat(BoltOptCategory));

static llvm::cl::opt<bool>
UseShortCallCache("hfsort+-use-short-call-cache",
llvm::cl::desc("Use a cache for shortCall results when computing hfsort+."),
llvm::cl::ZeroOrMore,
llvm::cl::init(true),
llvm::cl::Hidden,
llvm::cl::cat(BoltOptCategory));

} // namespace opts

namespace llvm {
Expand Down Expand Up @@ -353,7 +345,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC,
Clusters = clusterize(Cg);
break;
case BinaryFunction::RT_HFSORT_PLUS:
Clusters = hfsortPlus(Cg, opts::UseGainCache, opts::UseShortCallCache);
Clusters = hfsortPlus(Cg, opts::UseGainCache);
break;
case BinaryFunction::RT_PETTIS_HANSEN:
Clusters = pettisAndHansen(Cg);
Expand Down

0 comments on commit c588b5e

Please sign in to comment.