Skip to content

Commit

Permalink
Merge pull request #155 from pohanhuangtw/NVSHAS-9189
Browse files Browse the repository at this point in the history
[NVSHAS-9189] Scan gets stuck in scheduling after the controller is shut down and restarted
  • Loading branch information
jayhuang-suse authored Aug 1, 2024
2 parents 49f92bd + bf28afe commit bfc458d
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 2 deletions.
18 changes: 16 additions & 2 deletions scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ var cveDB *common.CveDB
var ctrlCaps share.ControllerCaps
var scanTasker *Tasker
var selfID string
var isGetCapsActivate bool

func dbRead(path string, maxRetry int, output string) map[string]*share.ScanVulnerability {
dbFile := path + share.DefaultCVEDBName
Expand Down Expand Up @@ -98,12 +99,14 @@ func dbRead(path string, maxRetry int, output string) map[string]*share.ScanVuln
}
}

func connectController(path, advIP, joinIP, selfID string, advPort uint32, joinPort uint16) {
func connectController(path, advIP, joinIP, selfID string, advPort uint32, joinPort uint16, doneCh chan bool) {
cb := &clientCallback{
shutCh: make(chan interface{}, 1),
ignoreShutdown: true,
}

var healthCheckCh chan struct{}

for {
// forever retry
dbData := dbRead(path, 0, "")
Expand All @@ -124,6 +127,17 @@ func connectController(path, advIP, joinIP, selfID string, advPort uint32, joinP
scanner.CVEDB = nil
dbData = make(map[string]*share.ScanVulnerability) // zero size

if healthCheckCh != nil {
close(healthCheckCh)
}

healthCheckCh = make(chan struct{})
// Check if the gRPC HealthCheck API is active (indicated by isGetCapsActivate being true).
// If active, initiate periodic health checks by launching a goroutine to monitor the health status of the specified service.
if isGetCapsActivate {
go periodCheckHealth(joinIP, joinPort, &scanner, cb, healthCheckCh, doneCh)
}

// start responding shutdown notice
cb.ignoreShutdown = false
<-cb.shutCh
Expand Down Expand Up @@ -416,7 +430,7 @@ func main() {

// Use the original address, which is the service name, so when controller changes,
// new IP can be resolved
go connectController(*dbPath, *adv, *join, selfID, (uint32)(*advPort), (uint16)(*joinPort))
go connectController(*dbPath, *adv, *join, selfID, (uint32)(*advPort), (uint16)(*joinPort), done)
<-done

log.Info("Exiting ...")
Expand Down
53 changes: 53 additions & 0 deletions server.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ import (
"github.com/neuvector/scanner/cvetools"
)

// Tuning knobs for the periodic controller health check (see periodCheckHealth).
const (
period = 20 // Health-check interval in minutes; periodCheckHealth also reuses this number as its retry delay in seconds
retryMax = 3 // Maximum number of health-check attempts made on each tick before the scanner gives up
)

// createEnforcerScanServiceWrapper builds an enforcer scan service client on
// top of the given gRPC connection and hands it back as a cluster.Service.
func createEnforcerScanServiceWrapper(conn *grpc.ClientConn) cluster.Service {
	var svc cluster.Service = share.NewEnforcerScanServiceClient(conn)
	return svc
}
Expand Down Expand Up @@ -365,8 +370,10 @@ func scannerRegister(joinIP string, joinPort uint16, data *share.ScannerRegister

caps, err := client.GetCaps(ctx, &share.RPCVoid{})
if err != nil {
isGetCapsActivate = false
downgradeCriticalSeverityInCVEDB(data)
} else {
isGetCapsActivate = true
ctrlCaps = *caps
if !caps.CriticalVul {
downgradeCriticalSeverityInCVEDB(data)
Expand Down Expand Up @@ -404,3 +411,49 @@ func scannerDeregister(joinIP string, joinPort uint16, id string) error {
}
return nil
}

// getScannerAvailable obtains a controller service client for joinIP:joinPort
// and invokes its HealthCheck call with the scanner's registration data.
//
// If no client can be obtained, the failure is logged and a ScannerAvailable
// with Visible set to false is returned together with an error. Otherwise the
// result and error of HealthCheck are returned unchanged. The call is bounded
// by a 60-second timeout.
func getScannerAvailable(joinIP string, joinPort uint16, data *share.ScannerRegisterData, cb cluster.GRPCCallback) (*share.ScannerAvailable, error) {
	ctrlClient, connErr := getControllerServiceClient(joinIP, joinPort, cb)
	if connErr != nil {
		fields := log.Fields{"error": connErr}
		log.WithFields(fields).Error("Failed to find ctrl client")

		notVisible := &share.ScannerAvailable{Visible: false}
		return notVisible, errors.New("Failed to connect to controller")
	}

	// Bound the RPC so an unresponsive controller cannot hang the caller.
	rpcCtx, release := context.WithTimeout(context.Background(), 60*time.Second)
	defer release()

	return ctrlClient.HealthCheck(rpcCtx, data)
}

// periodCheckHealth periodically calls the controller's HealthCheck (through
// getScannerAvailable) to verify that the controller is reachable and still
// reports this scanner as visible.
//
// Every `period` minutes it makes up to retryMax attempts, waiting `period`
// seconds between consecutive attempts. If none of the attempts reports the
// scanner as visible, it logs an error and sends true on done so the caller
// can shut the scanner down; the goroutine then exits, so done is signalled at
// most once.
//
// Parameters:
//   - joinIP, joinPort: controller address passed to getScannerAvailable.
//   - data: registration data sent with each health check.
//   - cb: gRPC callback passed to getScannerAvailable.
//   - healthCheckCh: closing this channel stops the goroutine; it is honored
//     while waiting for the next tick, while waiting between retries, and
//     while waiting to deliver on done.
//   - done: receives true when the health check has definitively failed.
func periodCheckHealth(joinIP string, joinPort uint16, data *share.ScannerRegisterData, cb *clientCallback, healthCheckCh chan struct{}, done chan bool) {
	ticker := time.NewTicker(time.Duration(period) * time.Minute)
	defer ticker.Stop()

	// The retry delay intentionally reuses the numeric value of `period`, in seconds.
	retryDelay := time.Duration(period) * time.Second

	for {
		select {
		case <-healthCheckCh:
			return
		case <-ticker.C:
		}

		healthy := false
		for attempt := 1; attempt <= retryMax; attempt++ {
			scannerAvailable, errHealthCheck := getScannerAvailable(joinIP, joinPort, data, cb)
			if errHealthCheck != nil {
				log.WithFields(log.Fields{"joinIP": joinIP, "joinPort": joinPort, "errHealthCheck": errHealthCheck}).Debug("periodCheckHealth has error")
			} else if scannerAvailable != nil && scannerAvailable.Visible {
				// Guard against a nil reply so a misbehaving server cannot crash the scanner.
				healthy = true
				break
			}

			// No point in waiting after the last attempt; report the failure right away.
			if attempt == retryMax {
				break
			}

			// Back off before retrying, but stay responsive to a stop request.
			timer := time.NewTimer(retryDelay)
			select {
			case <-healthCheckCh:
				timer.Stop()
				return
			case <-timer.C:
			}
		}
		if healthy {
			continue
		}

		log.WithFields(log.Fields{"joinIP": joinIP, "joinPort": joinPort, "retryMax": retryMax}).Error("The scanner is not in the controller, restart the scanner pod.")

		// Do not block forever on done if this checker is stopped in the meantime.
		select {
		case done <- true:
		case <-healthCheckCh:
		}
		return
	}
}

0 comments on commit bfc458d

Please sign in to comment.