Add lock expiry handler to expire state locks (#8562)

2019-11-25 16:39:43 -08:00
parent e542084c37
commit 720442b1a2
7 changed files with 182 additions and 7 deletions
--- a/cmd/lock-rest-server.go
+++ b/cmd/lock-rest-server.go
@@ -18,6 +18,7 @@ package cmd

 import (
 	"errors"
+	"math/rand"
 	"net/http"
 	"path"
 	"time"
@@ -127,6 +128,131 @@ func (l *lockRESTServer) RUnlockHandler(w http.ResponseWriter, r *http.Request)
 	}
 }

+// ExpiredHandler - query expired lock status.
+func (l *lockRESTServer) ExpiredHandler(w http.ResponseWriter, r *http.Request) {
+	if !l.IsValid(w, r) {
+		l.writeErrorResponse(w, errors.New("Invalid request"))
+		return
+	}
+
+	lockArgs := getLockArgs(r)
+
+	l.ll.mutex.Lock()
+	defer l.ll.mutex.Unlock()
+	// Lock found, proceed to verify if belongs to given uid.
+	if lri, ok := l.ll.lockMap[lockArgs.Resource]; ok {
+		// Check whether uid is still active
+		for _, entry := range lri {
+			if entry.UID == lockArgs.UID {
+				l.writeErrorResponse(w, errLockNotExpired)
+				return
+			}
+		}
+	}
+}
+
+// nameLockRequesterInfoPair is a helper type for lock maintenance
+type nameLockRequesterInfoPair struct {
+	name string
+	lri  lockRequesterInfo
+}
+
+// getLongLivedLocks returns locks that are older than a certain time and
+// have not been 'checked' for validity too soon enough
+func getLongLivedLocks(interval time.Duration) map[Endpoint][]nameLockRequesterInfoPair {
+	nlripMap := make(map[Endpoint][]nameLockRequesterInfoPair)
+	for endpoint, locker := range globalLockServers {
+		rslt := []nameLockRequesterInfoPair{}
+		locker.mutex.Lock()
+		for name, lriArray := range locker.lockMap {
+			for idx := range lriArray {
+				// Check whether enough time has gone by since last check
+				if time.Since(lriArray[idx].TimeLastCheck) >= interval {
+					rslt = append(rslt, nameLockRequesterInfoPair{name: name, lri: lriArray[idx]})
+					lriArray[idx].TimeLastCheck = UTCNow()
+				}
+			}
+		}
+		nlripMap[endpoint] = rslt
+		locker.mutex.Unlock()
+	}
+	return nlripMap
+}
+
+// lockMaintenance loops over locks that have been active for some time and checks back
+// with the original server whether it is still alive or not
+//
+// Following logic inside ignores the errors generated for Dsync.Active operation.
+// - server at client down
+// - some network error (and server is up normally)
+//
+// We will ignore the error, and we will retry later to get a resolve on this lock
+func lockMaintenance(interval time.Duration) {
+	// Validate if long lived locks are indeed clean.
+	// Get list of long lived locks to check for staleness.
+	for lendpoint, nlrips := range getLongLivedLocks(interval) {
+		for _, nlrip := range nlrips {
+			for _, ep := range globalEndpoints {
+				for _, endpoint := range ep.Endpoints {
+					if endpoint.String() == lendpoint.String() {
+						continue
+					}
+
+					c := newLockAPI(endpoint)
+					if !c.IsOnline() {
+						continue
+					}
+
+					// Call back to original server verify whether the lock is still active (based on name & uid)
+					expired, err := c.Expired(dsync.LockArgs{
+						UID:      nlrip.lri.UID,
+						Resource: nlrip.name,
+					})
+
+					if err != nil {
+						c.Close()
+						continue
+					}
+
+					// For successful response, verify if lock was indeed active or stale.
+					if expired {
+						// The lock is no longer active at server that originated
+						// the lock, attempt to remove the lock.
+						globalLockServers[lendpoint].mutex.Lock()
+						// Purge the stale entry if it exists.
+						globalLockServers[lendpoint].removeEntryIfExists(nlrip)
+						globalLockServers[lendpoint].mutex.Unlock()
+					}
+
+					// Close the connection regardless of the call response.
+					c.Close()
+				}
+			}
+		}
+	}
+}
+
+// Start lock maintenance from all lock servers.
+func startLockMaintenance() {
+	// Start with random sleep time, so as to avoid "synchronous checks" between servers
+	time.Sleep(time.Duration(rand.Float64() * float64(lockMaintenanceInterval)))
+
+	// Initialize a new ticker with a minute between each ticks.
+	ticker := time.NewTicker(lockMaintenanceInterval)
+	// Stop the timer upon service closure and cleanup the go-routine.
+	defer ticker.Stop()
+
+	for {
+		// Verifies every minute for locks held more than 2 minutes.
+		select {
+		case <-GlobalServiceDoneCh:
+			return
+		case <-ticker.C:
+			lockMaintenance(lockValidityCheckInterval)
+		}
+	}
+}
+
 // registerLockRESTHandlers - register lock rest router.
 func registerLockRESTHandlers(router *mux.Router, endpointZones EndpointZones) {
 	queries := restQueries(lockRESTUID, lockRESTSource, lockRESTResource)
@@ -145,8 +271,11 @@ func registerLockRESTHandlers(router *mux.Router, endpointZones EndpointZones) {
 			subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodRLock).HandlerFunc(httpTraceHdrs(lockServer.RLockHandler)).Queries(queries...)
 			subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodUnlock).HandlerFunc(httpTraceHdrs(lockServer.UnlockHandler)).Queries(queries...)
 			subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodRUnlock).HandlerFunc(httpTraceHdrs(lockServer.RUnlockHandler)).Queries(queries...)
+			subrouter.Methods(http.MethodPost).Path(lockRESTVersionPrefix + lockRESTMethodExpired).HandlerFunc(httpTraceAll(lockServer.ExpiredHandler)).Queries(queries...)

 			globalLockServers[endpoint] = lockServer.ll
 		}
 	}
+
+	go startLockMaintenance()
 }