From c588b5eaf55470d90a77fa63f465d1b599223b1f Mon Sep 17 00:00:00 2001
From: spupyrev
Date: Wed, 15 Nov 2017 14:17:39 -0800
Subject: [PATCH] speeding up caches for hfsort+

Summary:
When running hfsort+, we invalidate too many cache entries, which leads to
inefficiencies. Only the entries for the pairs of clusters (Into, X) and
(X, Into), for all clusters X, need to be invalidated when cluster Into is
modified. With this change, the short-call caches are no longer needed, since
each short-call value is computed only once per pair of clusters.

(cherry picked from FBD6341039)
---
 bolt/Passes/HFSort.h             |   4 +-
 bolt/Passes/HFSortPlus.cpp       | 102 ++++++------------------
 bolt/Passes/ReorderFunctions.cpp |  10 +--
 3 files changed, 22 insertions(+), 94 deletions(-)

diff --git a/bolt/Passes/HFSort.h b/bolt/Passes/HFSort.h
index 7c837e029397..2329ec171417 100644
--- a/bolt/Passes/HFSort.h
+++ b/bolt/Passes/HFSort.h
@@ -103,9 +103,7 @@ std::vector<Cluster> clusterize(const CallGraph &Cg);
 /*
  * Optimize function placement for iTLB cache and i-cache.
  */
-std::vector<Cluster> hfsortPlus(CallGraph &Cg,
-                                bool UseGainCache = true,
-                                bool UseShortCallCache = true);
+std::vector<Cluster> hfsortPlus(CallGraph &Cg, bool UseGainCache = true);
 
 /*
  * Pettis-Hansen code layout algorithm
diff --git a/bolt/Passes/HFSortPlus.cpp b/bolt/Passes/HFSortPlus.cpp
index d7006af2d005..fb8f2cbcf2c2 100644
--- a/bolt/Passes/HFSortPlus.cpp
+++ b/bolt/Passes/HFSortPlus.cpp
@@ -46,7 +46,7 @@ using namespace llvm;
 using namespace bolt;
 
 namespace opts {
-extern cl::OptionCategory BoltCategory;
+extern cl::OptionCategory BoltOptCategory;
 
 extern cl::opt<unsigned> Verbosity;
@@ -92,17 +92,6 @@ int32_t ITLBPageSize;
 // while smaller values result in better i-cache performance
 int32_t ITLBEntries;
 
-const char* cacheKindString(bool UseGainCache, bool UseShortCallCache) {
-  if (UseGainCache && UseShortCallCache)
-    return "gain + short call cache";
-  else if (UseGainCache)
-    return "gain cache";
-  else if (UseShortCallCache)
-    return "short call cache";
-  else
-    return "no cache";
-}
-
 // This class maintains adjacency information for all Clusters being
 // processed. It is used to invalidate cache entries when merging
 // Clusters and for visiting all neighbors of any given Cluster.
@@ -215,17 +204,16 @@ class PrecomputedResults {
     Valid[Index] = true;
   }
 
-  void invalidate(const AdjacencyMatrix &Adjacent, const Cluster *C) {
-    invalidate(C);
-    Adjacent.forallAdjacent(C, [&](const Cluster *A) { invalidate(A); });
-  }
-
- private:
   void invalidate(const Cluster *C) {
     Valid.reset(C->id() * Size, (C->id() + 1) * Size);
+    for (size_t Id = 0; Id < Size; Id++) {
+      Valid.reset(Id * Size + C->id());
+    }
   }
 
+ private:
   size_t index(const Cluster *First, const Cluster *Second) const {
-    return (First->id() * Size) + Second->id();
+    return First->id() * Size + Second->id();
   }
 
   size_t Size;
@@ -347,12 +335,6 @@ class HFSortPlus {
    * the same cache page
    */
   double shortCalls(const Cluster *Cluster) const {
-    if (UseShortCallCache) {
-      auto Itr = ShortCallCache.find(Cluster);
-      if (Itr != ShortCallCache.end())
-        return Itr->second;
-    }
-
     double Calls = 0;
     for (auto TargetId : Cluster->targets()) {
       for (auto Succ : Cg.successors(TargetId)) {
@@ -367,10 +349,6 @@ class HFSortPlus {
       }
     }
 
-    if (UseShortCallCache) {
-      ShortCallCache[Cluster] = Calls;
-    }
-
     return Calls;
   }
 
@@ -380,11 +358,6 @@ class HFSortPlus {
    */
   double shortCalls(const Cluster *ClusterPred,
                     const Cluster *ClusterSucc) const {
-    if (UseShortCallCache &&
-        ShortCallPairCache.contains(ClusterPred, ClusterSucc)) {
-      return ShortCallPairCache.get(ClusterPred, ClusterSucc);
-    }
-
     double Calls = 0;
     for (auto TargetId : ClusterPred->targets()) {
      for (auto Succ : Cg.successors(TargetId)) {
@@ -413,10 +386,6 @@ class HFSortPlus {
       }
     }
 
-    if (UseShortCallCache) {
-      ShortCallPairCache.set(ClusterPred, ClusterSucc, Calls);
-    }
-
     return Calls;
   }
 
@@ -434,8 +403,8 @@ class HFSortPlus {
    */
   double mergeGain(const Cluster *ClusterPred,
                    const Cluster *ClusterSucc) const {
-    if (UseGainCache && Cache.contains(ClusterPred, ClusterSucc)) {
-      return Cache.get(ClusterPred, ClusterSucc);
+    if (UseGainCache && GainCache.contains(ClusterPred, ClusterSucc)) {
+      return GainCache.get(ClusterPred, ClusterSucc);
     }
 
     // cache misses on the first cluster
@@ -460,7 +429,7 @@ class HFSortPlus {
     Gain /= std::min(ClusterPred->size(), ClusterSucc->size());
 
     if (UseGainCache) {
-      Cache.set(ClusterPred, ClusterSucc, Gain);
+      GainCache.set(ClusterPred, ClusterSucc, Gain);
     }
 
     return Gain;
@@ -513,7 +482,7 @@ class HFSortPlus {
     const double ProbOut =
       CallsFromPred > 0 ? CallsPredSucc / CallsFromPred : 0;
     assert(0.0 <= ProbOut && ProbOut <= 1.0 && "incorrect probability");
-    
+
     // probability that the second cluster is called from the first one
     const double ProbIn =
       CallsToSucc > 0 ? CallsPredSucc / CallsToSucc : 0;
@@ -601,13 +570,12 @@ class HFSortPlus {
    */
   std::vector<Cluster> run() {
     DEBUG(dbgs() << "Starting hfsort+ w/"
-                 << cacheKindString(UseGainCache, UseShortCallCache)
"gain cache" : "no cache") << " for " << Clusters.size() << " clusters " << "with ITLBPageSize = " << ITLBPageSize << ", " << "ITLBEntries = " << ITLBEntries << ", " << "and MergeProbability = " << opts::MergeProbability << "\n"); - // Pass 1 runPassOne(); @@ -628,9 +596,7 @@ class HFSortPlus { return Result; } - HFSortPlus(const CallGraph &Cg, - bool UseGainCache, - bool UseShortCallCache) + HFSortPlus(const CallGraph &Cg, bool UseGainCache) : Cg(Cg), FuncCluster(Cg.numNodes(), nullptr), Addr(Cg.numNodes(), InvalidAddr), @@ -638,9 +604,7 @@ class HFSortPlus { Clusters(initializeClusters()), Adjacent(Cg, Clusters, FuncCluster), UseGainCache(UseGainCache), - UseShortCallCache(UseShortCallCache), - Cache(Clusters.size()), - ShortCallPairCache(Clusters.size()) { + GainCache(Clusters.size()) { } private: @@ -696,31 +660,16 @@ class HFSortPlus { CurAddr = ((CurAddr + Align - 1) / Align) * Align; } - // Update caches - invalidateCaches(Into); + // Invalidate all cache entries associated with cluster Into + if (UseGainCache) { + GainCache.invalidate(Into); + } // Remove cluster From from the list of active clusters auto Iter = std::remove(Clusters.begin(), Clusters.end(), From); Clusters.erase(Iter, Clusters.end()); } - /* - * Invalidate all cache entries associated with cluster C and its neighbors. - */ - void invalidateCaches(const Cluster *C) { - if (UseShortCallCache) { - maybeErase(ShortCallCache, C); - Adjacent.forallAdjacent(C, - [this](const Cluster *A) { - maybeErase(ShortCallCache, A); - }); - ShortCallPairCache.invalidate(Adjacent, C); - } - if (UseGainCache) { - Cache.invalidate(Adjacent, C); - } - } - // The call graph const CallGraph &Cg; @@ -746,32 +695,21 @@ class HFSortPlus { // Use cache for mergeGain results bool UseGainCache; - // Use caches for shortCalls results - bool UseShortCallCache; - // A cache that keeps precomputed values of mergeGain for pairs of clusters; // when a pair of clusters (x,y) gets merged, we need to invalidate the pairs // containing both x and y and all clusters adjacent to x and y (and recompute // them on the next iteration). - mutable PrecomputedResults Cache; - - // Cache for shortCalls for a single cluster. - mutable std::unordered_map ShortCallCache; - - // Cache for shortCalls for a pair of Clusters - mutable PrecomputedResults ShortCallPairCache; + mutable PrecomputedResults GainCache; }; } -std::vector hfsortPlus(CallGraph &Cg, - bool UseGainCache, - bool UseShortCallCache) { +std::vector hfsortPlus(CallGraph &Cg, bool UseGainCache) { // It is required that the sum of incoming arc weights is not greater // than the number of samples for every function. // Ensuring the call graph obeys the property before running the algorithm. 
   Cg.adjustArcWeights();
-  return HFSortPlus(Cg, UseGainCache, UseShortCallCache).run();
+  return HFSortPlus(Cg, UseGainCache).run();
 }
 
 }}
diff --git a/bolt/Passes/ReorderFunctions.cpp b/bolt/Passes/ReorderFunctions.cpp
index 4676c1c2fa8a..bf4f178e2259 100644
--- a/bolt/Passes/ReorderFunctions.cpp
+++ b/bolt/Passes/ReorderFunctions.cpp
@@ -119,14 +119,6 @@ UseGainCache("hfsort+-use-cache",
   llvm::cl::Hidden,
   llvm::cl::cat(BoltOptCategory));
 
-static llvm::cl::opt<bool>
-UseShortCallCache("hfsort+-use-short-call-cache",
-  llvm::cl::desc("Use a cache for shortCall results when computing hfsort+."),
-  llvm::cl::ZeroOrMore,
-  llvm::cl::init(true),
-  llvm::cl::Hidden,
-  llvm::cl::cat(BoltOptCategory));
-
 } // namespace opts
 
 namespace llvm {
@@ -353,7 +345,7 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC,
     Clusters = clusterize(Cg);
     break;
   case BinaryFunction::RT_HFSORT_PLUS:
-    Clusters = hfsortPlus(Cg, opts::UseGainCache, opts::UseShortCallCache);
+    Clusters = hfsortPlus(Cg, opts::UseGainCache);
     break;
   case BinaryFunction::RT_PETTIS_HANSEN:
     Clusters = pettisAndHansen(Cg);
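
For readers who want to see the new invalidation scheme in isolation: after
merging cluster From into cluster Into, only cached pairs involving Into can
be stale, so it suffices to clear row Into and column Into of the flat N*N
validity bitmap, exactly what the patched PrecomputedResults::invalidate does.
The sketch below mirrors that logic as a self-contained C++ program; PairCache,
plain integer cluster ids, std::vector<bool>, and std::vector<double> are
illustrative stand-ins for BOLT's PrecomputedResults, Cluster *,
llvm::BitVector, and the cached gain values, not the actual implementation.

#include <cassert>
#include <cstddef>
#include <vector>

// A pair cache for N clusters backed by a flat N*N array, mirroring the
// row/column invalidation that the patched PrecomputedResults performs.
class PairCache {
public:
  explicit PairCache(std::size_t Size)
      : Size(Size), Cache(Size * Size), Valid(Size * Size) {}

  bool contains(std::size_t First, std::size_t Second) const {
    return Valid[index(First, Second)];
  }

  double get(std::size_t First, std::size_t Second) const {
    assert(contains(First, Second) && "reading an invalidated entry");
    return Cache[index(First, Second)];
  }

  void set(std::size_t First, std::size_t Second, double Value) {
    const std::size_t Index = index(First, Second);
    Cache[Index] = Value;
    Valid[Index] = true;
  }

  // Invalidate every pair (C, X) and (X, C) for all clusters X.
  void invalidate(std::size_t C) {
    // Row C, i.e. the pairs (C, 0) .. (C, Size - 1), is one contiguous run
    // (the patch clears it with a single BitVector::reset over the range).
    for (std::size_t Id = 0; Id < Size; Id++)
      Valid[C * Size + Id] = false;
    // Column C, i.e. the pairs (0, C) .. (Size - 1, C), has stride Size
    // (the patch clears it with the newly added per-element loop).
    for (std::size_t Id = 0; Id < Size; Id++)
      Valid[Id * Size + C] = false;
  }

private:
  std::size_t index(std::size_t First, std::size_t Second) const {
    return First * Size + Second;
  }

  std::size_t Size;
  std::vector<double> Cache;
  std::vector<bool> Valid;
};

int main() {
  PairCache GainCache(4);
  GainCache.set(1, 2, 0.5);
  GainCache.set(2, 1, 0.25);
  GainCache.set(0, 3, 1.0);

  // Merging into cluster 1 invalidates all pairs involving 1 ...
  GainCache.invalidate(1);
  assert(!GainCache.contains(1, 2) && !GainCache.contains(2, 1));
  // ... while unrelated pairs, such as (0, 3), stay cached.
  assert(GainCache.contains(0, 3) && GainCache.get(0, 3) == 1.0);
  return 0;
}

Compared to the removed invalidateCaches, which also walked the adjacency
matrix to invalidate every neighbor of the merged cluster, a merge now clears
exactly one row and one column (2N entries), and the separate short-call
caches become unnecessary because each shortCalls value is computed only once
per pair of clusters.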