From 9367c755769a19d40398fd54352591a352c3fec3 Mon Sep 17 00:00:00 2001 From: Hemil K Date: Wed, 9 Mar 2022 11:54:24 -0800 Subject: [PATCH] resolve potential deadlock (#154) * resolve potential deadlock * cancel signing request if client cancelled * cancel work job when client cancels request * address comments * address comments --- config/config.go | 2 +- pkcs11/signer.go | 41 +++++++-------------- pkcs11/signer_test.go | 8 +++-- pkcs11/work.go | 74 ++++++++++++++++++++++++++++---------- server/scheduler/worker.go | 10 ++++-- 5 files changed, 81 insertions(+), 54 deletions(-) diff --git a/config/config.go b/config/config.go index aba8fab4..32b23507 100644 --- a/config/config.go +++ b/config/config.go @@ -31,7 +31,7 @@ const ( defaultIdleTimeout = 30 defaultReadTimeout = 10 defaultWriteTimeout = 10 - DefaultPKCS11Timeout = 10 + DefaultPKCS11Timeout = 10 // in seconds // X509CertEndpoint specifies the endpoint for signing X509 certificate. X509CertEndpoint = "/sig/x509-cert" diff --git a/pkcs11/signer.go b/pkcs11/signer.go index 2a893f8d..e0483025 100644 --- a/pkcs11/signer.go +++ b/pkcs11/signer.go @@ -42,10 +42,9 @@ import ( It also has a channel on which it waits for the response. */ type Request struct { - pool sPool // pool is a signer pool per identifier from which to fetch the signer - identifier string // identifier indicates the endpoint for which we are fetching the signer in order to sign it - remainingTime time.Duration // remainingTime indicates the time remaining before either the client cancels or the request times out. 
- respChan chan signerWithSignAlgorithm // respChan is the channel where the worker sends the signer once it gets it from the pool + pool sPool // pool is a signer pool per identifier from which to fetch the signer + identifier string // identifier indicates the endpoint for which we are fetching the signer in order to sign it + respChan chan signerWithSignAlgorithm // respChan is the channel where the worker sends the signer once it gets it from the pool } // signer implements crypki.CertSign interface. @@ -67,19 +66,7 @@ type signer struct { requestTimeout uint } -func getRemainingRequestTime(ctx context.Context, keyIdentifier string, requestTimeout uint) (time.Duration, error) { - remTime := time.Duration(requestTimeout) * time.Second - if deadline, ok := ctx.Deadline(); ok { - remTime = time.Until(deadline) - if remTime <= 0 { - // context expired, we should stop processing and return immediately - return 0, fmt.Errorf("context deadline expired for key identifier %q", keyIdentifier) - } - } - return remTime, nil -} - -func getSigner(ctx context.Context, requestChan chan scheduler.Request, pool sPool, keyIdentifier string, priority proto.Priority, requestTimeout uint) (signer signerWithSignAlgorithm, err error) { +func getSigner(ctx context.Context, requestChan chan scheduler.Request, pool sPool, keyIdentifier string, priority proto.Priority) (signer signerWithSignAlgorithm, err error) { // Need to handle case when we directly invoke SignSSHCert or SignX509Cert for // either generating the host certs or X509 CA certs. In that case we don't need the server // running nor do we need to worry about priority scheduling. 
In that case, we immediately @@ -87,16 +74,11 @@ func getSigner(ctx context.Context, requestChan chan scheduler.Request, pool sPo if requestChan == nil { return pool.get(ctx) } - remTime, err := getRemainingRequestTime(ctx, keyIdentifier, requestTimeout) - if err != nil { - return nil, err - } respChan := make(chan signerWithSignAlgorithm) req := &Request{ - pool: pool, - identifier: keyIdentifier, - remainingTime: remTime, - respChan: respChan, + pool: pool, + identifier: keyIdentifier, + respChan: respChan, } if priority == proto.Priority_Unspecified_priority { // If priority is unspecified, treat the request as high priority. @@ -116,7 +98,8 @@ func getSigner(ctx context.Context, requestChan chan scheduler.Request, pool sPo return nil, errors.New("client request timed out, skip signing cert request") } case <-ctx.Done(): - // In order to ensure we don't keep on blocking on the response, we add this check. + // In order to ensure we don't keep on blocking on the response, we close the response channel for this request & return. 
+ close(respChan) return nil, ctx.Err() } return signer, nil @@ -210,7 +193,7 @@ func (s *signer) SignSSHCert(ctx context.Context, reqChan chan scheduler.Request return nil, fmt.Errorf("unknown key identifier %q", keyIdentifier) } pStart := time.Now() - signer, err := getSigner(ctx, reqChan, pool, keyIdentifier, priority, s.requestTimeout) + signer, err := getSigner(ctx, reqChan, pool, keyIdentifier, priority) if err != nil { pt = time.Since(pStart).Nanoseconds() / time.Microsecond.Nanoseconds() return nil, err @@ -258,7 +241,7 @@ func (s *signer) SignX509Cert(ctx context.Context, reqChan chan scheduler.Reques return nil, fmt.Errorf("unknown key identifier %q", keyIdentifier) } pStart := time.Now() - signer, err := getSigner(ctx, reqChan, pool, keyIdentifier, priority, s.requestTimeout) + signer, err := getSigner(ctx, reqChan, pool, keyIdentifier, priority) if err != nil { pt = time.Since(pStart).Nanoseconds() / time.Microsecond.Nanoseconds() return nil, err @@ -323,7 +306,7 @@ func (s *signer) SignBlob(ctx context.Context, reqChan chan scheduler.Request, d return nil, fmt.Errorf("unknown key identifier %q", keyIdentifier) } pStart := time.Now() - signer, err := getSigner(ctx, reqChan, pool, keyIdentifier, priority, s.requestTimeout) + signer, err := getSigner(ctx, reqChan, pool, keyIdentifier, priority) if err != nil { pt = time.Since(pStart).Nanoseconds() / time.Microsecond.Nanoseconds() return nil, err diff --git a/pkcs11/signer_test.go b/pkcs11/signer_test.go index b474bb7b..9eb31a2b 100644 --- a/pkcs11/signer_test.go +++ b/pkcs11/signer_test.go @@ -113,7 +113,7 @@ func dummyScheduler(ctx context.Context, reqChan chan scheduler.Request) { req := <-reqChan go func() { // create worker with different priorities - worker := &scheduler.Worker{ID: 1, Priority: req.Priority, Quit: make(chan struct{})} + worker := &scheduler.Worker{ID: 1, Priority: req.Priority, Quit: make(chan struct{}), HSMTimeout: 1 * time.Second} req.DoWorker.DoWork(ctx, worker) }() } @@ -336,7 
+336,8 @@ func TestSignX509RSACert(t *testing.T) { cp := x509.NewCertPool() cp.AddCert(caCert) - ctx := context.Background() + ctx, cnc := context.WithTimeout(context.Background(), 1*time.Second) + defer cnc() cancelCtx, cancel := context.WithCancel(ctx) defer cancel() @@ -424,7 +425,8 @@ func TestSignX509ECCert(t *testing.T) { cp := x509.NewCertPool() cp.AddCert(caCert) - ctx := context.Background() + ctx, cnc := context.WithTimeout(context.Background(), 1*time.Second) + defer cnc() reqChan := make(chan scheduler.Request) testcases := map[string]struct { ctx context.Context diff --git a/pkcs11/work.go b/pkcs11/work.go index e64a212c..26e5a48c 100644 --- a/pkcs11/work.go +++ b/pkcs11/work.go @@ -26,30 +26,68 @@ type Work struct { work *Request // workChan is a channel which has a request enqueue for the worker to work on. } -//DoWork performs the work of fetching the signer from the pool and sending it back on the response channel +// DoWork performs the work of fetching the signer from the pool and sending it back on the response channel. +// If the client cancels the request or times out, the worker should not wait indefinitely for getting the signer +// from the pool. We also have a PKCS11 timeout, which is the maximum duration for which the worker waits to fetch the +// signer from the pool; the client request is cancelled if it exceeds that.
func (w *Work) DoWork(workerCtx context.Context, worker *scheduler.Worker) { - select { - case <-workerCtx.Done(): - log.Printf("%s: worker stopped", worker.String()) - return - default: - reqCtx, cancel := context.WithTimeout(context.Background(), w.work.remainingTime) - defer cancel() + reqCtx, cancel := context.WithTimeout(context.Background(), worker.HSMTimeout) + type resp struct { + signer signerWithSignAlgorithm + err error + } + + signerRespCh := make(chan resp) + go func(ctx context.Context) { signer, err := w.work.pool.get(reqCtx) - if err != nil { - worker.TotalTimeout.Inc() - log.Printf("%s: error fetching signer %v", worker.String(), err) - w.work.respChan <- nil + select { + case signerRespCh <- resp{signer, err}: + case <-ctx.Done(): return } + }(workerCtx) + + for { select { - case <-reqCtx.Done(): - // request timed out, increment timeout context & return nil. - worker.TotalTimeout.Inc() - w.work.respChan <- nil - default: + case <-workerCtx.Done(): + // Case 1: Worker stopped either due to context cancelled or worker timed out. + // This case is to avoid worker being stuck in a blocking call or a deadlock scenario. + log.Printf("%s: worker stopped", worker.String()) + cancel() + w.sendResponse(nil) + return + case resp := <-signerRespCh: + // Case 2: Received response. It could either be a pkcs11 timeout or the worker was able to get a signer + // from the signer pool. + if resp.signer == nil || resp.err != nil { + worker.TotalTimeout.Inc() + log.Printf("%s: error fetching signer %v", worker.String(), resp.err) + w.work.respChan <- nil + cancel() + return + } worker.TotalProcessed.Inc() - w.work.respChan <- signer + w.sendResponse(resp.signer) + cancel() + return + case _, ok := <-w.work.respChan: + // Case 3: Check for current state of respChan. If the client request is cancelled, the client + // will close the respChan. In that case, we should cancel reqCtx & return to avoid extra processing.
+ if !ok { + log.Printf("%s: worker request timed out, client cancelled request", worker.String()) + cancel() + worker.TotalTimeout.Inc() + return + } } } } + +// sendResponse sends the response on the respChan if the channel is not yet closed by the client. +func (w *Work) sendResponse(resp signerWithSignAlgorithm) { + select { + case <-w.work.respChan: + default: + w.work.respChan <- resp + } +} diff --git a/server/scheduler/worker.go b/server/scheduler/worker.go index 24a1086a..6dc52923 100644 --- a/server/scheduler/worker.go +++ b/server/scheduler/worker.go @@ -17,7 +17,9 @@ import ( "context" "fmt" "log" + "time" + "github.com/theparanoids/crypki/config" "github.com/theparanoids/crypki/proto" ) @@ -36,6 +38,7 @@ type Request struct { type Worker struct { ID int // ID is a unique id for the worker Priority proto.Priority // Priority indicates the priority of the request the worker is handling. + HSMTimeout time.Duration // HSMTimeout is the max time a worker can wait to get signer from pool. TotalProcessed Counter // TotalProcessed indicates the total requests processed per priority by this worker. TotalTimeout Counter // TotalTimeout indicates the total requests that timed out before worker could process it. Quit chan struct{} // Quit is a channel to cancel the worker @@ -56,9 +59,10 @@ func (w *Worker) String() string { // that the worker can add itself to when it is idle. It also creates a slice for storing totalProcessed requests. func newWorker(workerId int, workerPriority proto.Priority) *Worker { return &Worker{ - ID: workerId, - Priority: workerPriority, - Quit: make(chan struct{}), + ID: workerId, + Priority: workerPriority, + HSMTimeout: config.DefaultPKCS11Timeout * time.Second, + Quit: make(chan struct{}), } }