From d0d32d7a8db93608045fa259d38043266fe27f65 Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Thu, 12 Dec 2024 13:52:33 +0800 Subject: [PATCH] checker, statistic: avoid leak in label statistic (#8703) (#8898) close tikv/pd#8700 Signed-off-by: lhy1024 Co-authored-by: lhy1024 Co-authored-by: ti-chi-bot[bot] <108142056+ti-chi-bot[bot]@users.noreply.github.com> --- pkg/cluster/cluster.go | 2 +- pkg/mcs/scheduling/server/cluster.go | 3 +-- pkg/statistics/region_collection.go | 24 +++++++++++++---- server/cluster/cluster_test.go | 36 ++++++++++++++++++++++--- server/cluster/scheduling_controller.go | 1 + 5 files changed, 55 insertions(+), 11 deletions(-) diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 916200bfa3e..b3873cf8cee 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -49,7 +49,7 @@ func HandleOverlaps(c Cluster, overlaps []*core.RegionInfo) { if c.GetRegionStats() != nil { c.GetRegionStats().ClearDefunctRegion(item.GetID()) } - c.GetLabelStats().ClearDefunctRegion(item.GetID()) + c.GetLabelStats().MarkDefunctRegion(item.GetID()) c.GetRuleManager().InvalidCache(item.GetID()) } } diff --git a/pkg/mcs/scheduling/server/cluster.go b/pkg/mcs/scheduling/server/cluster.go index 647706b0afd..c05bdb78921 100644 --- a/pkg/mcs/scheduling/server/cluster.go +++ b/pkg/mcs/scheduling/server/cluster.go @@ -355,13 +355,12 @@ func (c *Cluster) waitSchedulersInitialized() { } } -// TODO: implement the following methods - // UpdateRegionsLabelLevelStats updates the status of the region label level by types. func (c *Cluster) UpdateRegionsLabelLevelStats(regions []*core.RegionInfo) { for _, region := range regions { c.labelStats.Observe(region, c.getStoresWithoutLabelLocked(region, core.EngineKey, core.EngineTiFlash), c.persistConfig.GetLocationLabels()) } + c.labelStats.ClearDefunctRegions() } func (c *Cluster) getStoresWithoutLabelLocked(region *core.RegionInfo, key, value string) []*core.StoreInfo { diff --git a/pkg/statistics/region_collection.go b/pkg/statistics/region_collection.go index 52655b093a2..cced5e9149b 100644 --- a/pkg/statistics/region_collection.go +++ b/pkg/statistics/region_collection.go @@ -306,6 +306,7 @@ type LabelStatistics struct { syncutil.RWMutex regionLabelStats map[uint64]string labelCounter map[string]int + defunctRegions map[uint64]struct{} } // NewLabelStatistics creates a new LabelStatistics. @@ -313,6 +314,7 @@ func NewLabelStatistics() *LabelStatistics { return &LabelStatistics{ regionLabelStats: make(map[uint64]string), labelCounter: make(map[string]int), + defunctRegions: make(map[uint64]struct{}), } } @@ -346,14 +348,26 @@ func ResetLabelStatsMetrics() { regionLabelLevelGauge.Reset() } -// ClearDefunctRegion is used to handle the overlap region. -func (l *LabelStatistics) ClearDefunctRegion(regionID uint64) { +// MarkDefunctRegion is used to handle the overlap region. +// It is used to mark the region as defunct and remove it from the label statistics later. +func (l *LabelStatistics) MarkDefunctRegion(regionID uint64) { l.Lock() defer l.Unlock() - if label, ok := l.regionLabelStats[regionID]; ok { - l.labelCounter[label]-- - delete(l.regionLabelStats, regionID) + l.defunctRegions[regionID] = struct{}{} +} + +// ClearDefunctRegions is used to handle the overlap region. +// It is used to remove the defunct regions from the label statistics. +func (l *LabelStatistics) ClearDefunctRegions() { + l.Lock() + defer l.Unlock() + for regionID := range l.defunctRegions { + if label, ok := l.regionLabelStats[regionID]; ok { + l.labelCounter[label]-- + delete(l.regionLabelStats, regionID) + } } + l.defunctRegions = make(map[uint64]struct{}) } // GetLabelCounter is only used for tests. diff --git a/server/cluster/cluster_test.go b/server/cluster/cluster_test.go index c65369fa63a..006edab924a 100644 --- a/server/cluster/cluster_test.go +++ b/server/cluster/cluster_test.go @@ -1118,6 +1118,7 @@ func TestRegionLabelIsolationLevel(t *testing.T) { opt.SetReplicationConfig(cfg) re.NoError(err) cluster := newTestRaftCluster(ctx, mockid.NewIDAllocator(), opt, storage.NewStorageWithMemoryBackend()) + cluster.coordinator = schedule.NewCoordinator(ctx, cluster, nil) for i := uint64(1); i <= 4; i++ { var labels []*metapb.StoreLabel @@ -1152,13 +1153,42 @@ func TestRegionLabelIsolationLevel(t *testing.T) { StartKey: []byte{byte(1)}, EndKey: []byte{byte(2)}, } - r := core.NewRegionInfo(region, peers[0]) - re.NoError(cluster.putRegion(r)) + r1 := core.NewRegionInfo(region, peers[0]) + re.NoError(cluster.putRegion(r1)) - cluster.UpdateRegionsLabelLevelStats([]*core.RegionInfo{r}) + cluster.UpdateRegionsLabelLevelStats([]*core.RegionInfo{r1}) counter := cluster.labelStats.GetLabelCounter() re.Equal(0, counter["none"]) re.Equal(1, counter["zone"]) + + region = &metapb.Region{ + Id: 10, + Peers: peers, + StartKey: []byte{byte(2)}, + EndKey: []byte{byte(3)}, + } + r2 := core.NewRegionInfo(region, peers[0]) + re.NoError(cluster.putRegion(r2)) + + cluster.UpdateRegionsLabelLevelStats([]*core.RegionInfo{r2}) + counter = cluster.labelStats.GetLabelCounter() + re.Equal(0, counter["none"]) + re.Equal(2, counter["zone"]) + + // issue: https://github.com/tikv/pd/issues/8700 + // step1: heartbeat a overlap region, which is used to simulate the case that the region is merged. + // step2: update region 9 and region 10, which is used to simulate the case that patrol is triggered. + // We should only count region 9. + overlapRegion := r1.Clone( + core.WithStartKey(r1.GetStartKey()), + core.WithEndKey(r2.GetEndKey()), + core.WithLeader(r2.GetPeer(8)), + ) + re.NoError(cluster.HandleRegionHeartbeat(overlapRegion)) + cluster.UpdateRegionsLabelLevelStats([]*core.RegionInfo{r1, r2}) + counter = cluster.labelStats.GetLabelCounter() + re.Equal(0, counter["none"]) + re.Equal(1, counter["zone"]) } func heartbeatRegions(re *require.Assertions, cluster *RaftCluster, regions []*core.RegionInfo) { diff --git a/server/cluster/scheduling_controller.go b/server/cluster/scheduling_controller.go index 6a63060022f..4bb6f349ef5 100644 --- a/server/cluster/scheduling_controller.go +++ b/server/cluster/scheduling_controller.go @@ -236,6 +236,7 @@ func (sc *schedulingController) UpdateRegionsLabelLevelStats(regions []*core.Reg for _, region := range regions { sc.labelStats.Observe(region, sc.getStoresWithoutLabelLocked(region, core.EngineKey, core.EngineTiFlash), sc.opt.GetLocationLabels()) } + sc.labelStats.ClearDefunctRegions() } func (sc *schedulingController) getStoresWithoutLabelLocked(region *core.RegionInfo, key, value string) []*core.StoreInfo {