libs/service: regularize Stop semantics and concurrency primitives (#7809)

2026-01-05 13:05:09 +00:00 · 2022-02-14 08:28:29 -05:00
parent 73f605af3f
commit 824960c565
9 changed files with 226 additions and 116 deletions
--- a/libs/service/service.go
+++ b/libs/service/service.go
@@ -3,7 +3,7 @@ package service
 import (
 	"context"
 	"errors"
-	"sync/atomic"
+	"sync"

 	"github.com/tendermint/tendermint/libs/log"
 )
@@ -30,9 +30,6 @@ type Service interface {
 	// Return true if the service is running
 	IsRunning() bool

-	// String representation of the service
-	String() string
-
 	// Wait blocks until the service is stopped.
 	Wait()
 }
@@ -40,8 +37,6 @@ type Service interface {
 // Implementation describes the implementation that the
 // BaseService implementation wraps.
 type Implementation interface {
-	Service
-
 	// Called by the Services Start Method
 	OnStart(context.Context) error

@@ -57,12 +52,7 @@ Users can override the OnStart/OnStop methods. In the absence of errors, these
 methods are guaranteed to be called at most once. If OnStart returns an error,
 service won't be marked as started, so the user can call Start again.

-A call to Reset will panic, unless OnReset is overwritten, allowing
-OnStart/OnStop to be called again.
-
-The caller must ensure that Start and Stop are not called concurrently.
-
-It is ok to call Stop without calling Start first.
+It is safe, but an error, to call Stop before Start: Stop returns ErrNotStarted.

 Typical usage:

@@ -80,23 +70,21 @@ Typical usage:
 	}

 	func (fs *FooService) OnStart(ctx context.Context) error {
-		fs.BaseService.OnStart() // Always call the overridden method.
 		// initialize private fields
 		// start subroutines, etc.
 	}

-	func (fs *FooService) OnStop() error {
+	func (fs *FooService) OnStop() {
-		fs.BaseService.OnStop() // Always call the overridden method.
 		// close/destroy private fields
 		// stop subroutines, etc.
 	}
 */
 type BaseService struct {
-	logger  log.Logger
-	name    string
-	started uint32 // atomic
-	stopped uint32 // atomic
-	quit    chan struct{}
+	logger log.Logger
+	name   string
+	mtx    sync.Mutex
+	quit   <-chan (struct{})
+	cancel context.CancelFunc

 	// The "subclass" of BaseService
 	impl Implementation
@@ -107,7 +95,6 @@ func NewBaseService(logger log.Logger, name string, impl Implementation) *BaseSe
 	return &BaseService{
 		logger: logger,
 		name:   name,
-		quit:   make(chan struct{}),
 		impl:   impl,
 	}
 }
@@ -116,83 +103,101 @@ func NewBaseService(logger log.Logger, name string, impl Implementation) *BaseSe
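-// returned if the service is already running or stopped.  To restart a
-// stopped service, call Reset.
+// returned if the service is already running or stopped; a stopped
+// service cannot be started again.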
 func (bs *BaseService) Start(ctx context.Context) error {
-	if atomic.CompareAndSwapUint32(&bs.started, 0, 1) {
-		if atomic.LoadUint32(&bs.stopped) == 1 {
-			bs.logger.Error("not starting service; already stopped", "service", bs.name, "impl", bs.impl.String())
-			atomic.StoreUint32(&bs.started, 0)
-			return ErrAlreadyStopped
-		}
+	bs.mtx.Lock()
+	defer bs.mtx.Unlock()

-		bs.logger.Info("starting service", "service", bs.name, "impl", bs.impl.String())
+	select {
+	case <-bs.quit: // quit is closed once Stop runs; a nil quit (never started) never fires
+		return ErrAlreadyStopped
+	default:
+		if bs.quit != nil {
+			return ErrAlreadyStarted
+		}

+		bs.logger.Info("starting service", "service", bs.name, "impl", bs.name)
 		if err := bs.impl.OnStart(ctx); err != nil {
-			// revert flag
-			atomic.StoreUint32(&bs.started, 0)
 			return err
 		}

+		// Use a service-owned context, independent of ctx, so that Stop
+		// can end the watcher goroutine below and close the channel
+		// that Wait and IsRunning observe even if ctx is never canceled.
+		srvCtx, cancel := context.WithCancel(context.Background())
+		bs.cancel = cancel
+		bs.quit = srvCtx.Done()
+
 		go func(ctx context.Context) {
 			select {
-			case <-bs.quit:
-				// someone else explicitly called stop
-				// and then we shouldn't.
+			case <-srvCtx.Done():
+				// Stop was called explicitly; nothing left to do.
 				return
 			case <-ctx.Done():
-				// if nothing is running, no need to
-				// shut down again.
-				if !bs.impl.IsRunning() {
-					return
-				}
-
-				// the context was cancel and we
-				// should stop.
-				if err := bs.Stop(); err != nil {
-					bs.logger.Error("stopped service",
-						"err", err.Error(),
-						"service", bs.name,
-						"impl", bs.impl.String())
-				}
-
-				bs.logger.Info("stopped service",
-					"service", bs.name,
-					"impl", bs.impl.String())
+				_ = bs.Stop() // an error here only means Stop already ran
 			}
+
+			bs.logger.Info("stopped service",
+				"service", bs.name)
 		}(ctx)

 		return nil
 	}
-
-	return ErrAlreadyStarted
 }

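-// Stop implements Service by calling OnStop (if defined) and closing quit
-// channel. An error will be returned if the service is already stopped.
+// Stop implements Service by calling OnStop and closing the quit channel. An
+// error is returned if the service was never started or is already stopped.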
 func (bs *BaseService) Stop() error {
-	if atomic.CompareAndSwapUint32(&bs.stopped, 0, 1) {
-		if atomic.LoadUint32(&bs.started) == 0 {
-			bs.logger.Error("not stopping service; not started yet", "service", bs.name, "impl", bs.impl.String())
-			atomic.StoreUint32(&bs.stopped, 0)
-			return ErrNotStarted
-		}
+	bs.mtx.Lock()
+	defer bs.mtx.Unlock()

-		bs.logger.Info("stopping service", "service", bs.name, "impl", bs.impl.String())
+	if bs.quit == nil {
+		return ErrNotStarted
+	}
+
+	select {
+	case <-bs.quit:
+		return ErrAlreadyStopped
+	default:
+		bs.logger.Info("stopping service", "service", bs.name)
 		bs.impl.OnStop()
-		close(bs.quit)
+		bs.cancel()

 		return nil
 	}
-
-	return ErrAlreadyStopped
 }

 // IsRunning implements Service by returning true or false depending on the
 // service's state.
 func (bs *BaseService) IsRunning() bool {
-	return atomic.LoadUint32(&bs.started) == 1 && atomic.LoadUint32(&bs.stopped) == 0
+	bs.mtx.Lock()
+	defer bs.mtx.Unlock()
+
+	if bs.quit == nil {
+		return false
+	}
+
+	select {
+	case <-bs.quit:
+		return false
+	default:
+		return true
+	}
+}
+
+func (bs *BaseService) getWait() <-chan struct{} {
+	bs.mtx.Lock()
+	defer bs.mtx.Unlock()
+
+	if bs.quit == nil {
+		out := make(chan struct{})
+		close(out)
+		return out
+	}
+
+	return bs.quit
 }

 // Wait blocks until the service is stopped.
-func (bs *BaseService) Wait() { <-bs.quit }
+func (bs *BaseService) Wait() { <-bs.getWait() }

-// String implements Service by returning a string representation of the service.
+// String returns the name the service was created with.
 func (bs *BaseService) String() string { return bs.name }
--- a/libs/service/service_test.go
+++ b/libs/service/service_test.go
@@ -2,45 +2,135 @@ package service

 import (
 	"context"
+	"sync"
 	"testing"
 	"time"

+	"github.com/fortytw2/leaktest"
+	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	"github.com/tendermint/tendermint/libs/log"
 )

 type testService struct {
+	started      bool
+	stopped      bool
+	multiStopped bool
+	mu           sync.Mutex
 	BaseService
 }

-func (testService) OnStop() {}
-func (testService) OnStart(context.Context) error {
+func (t *testService) OnStop() {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	if t.stopped { // a second call means OnStop ran more than once
+		t.multiStopped = true
+	}
+	t.stopped = true
+}
+func (t *testService) OnStart(context.Context) error {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+
+	t.started = true
 	return nil
 }

-func TestBaseServiceWait(t *testing.T) {
+func (t *testService) isStarted() bool {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.started
+}
+
+func (t *testService) isStopped() bool {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.stopped
+}
+
+func (t *testService) isMultiStopped() bool {
+	t.mu.Lock()
+	defer t.mu.Unlock()
+	return t.multiStopped
+}
+
+func TestBaseService(t *testing.T) {
+	t.Cleanup(leaktest.Check(t))
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()

-	logger := log.NewTestingLogger(t)
+	logger := log.NewNopLogger()

-	ts := &testService{}
-	ts.BaseService = *NewBaseService(logger, "TestService", ts)
-	err := ts.Start(ctx)
-	require.NoError(t, err)
+	t.Run("Wait", func(t *testing.T) {
+		wctx, wcancel := context.WithCancel(ctx)
+		defer wcancel()
+		ts := &testService{}
+		ts.BaseService = *NewBaseService(logger, t.Name(), ts)
+		err := ts.Start(wctx)
+		require.NoError(t, err)
+		require.True(t, ts.isStarted())

-	waitFinished := make(chan struct{})
-	go func() {
-		ts.Wait()
-		waitFinished <- struct{}{}
-	}()
+		waitFinished := make(chan struct{})
+		wcancel()
+		go func() {
+			ts.Wait()
+			close(waitFinished)
+		}()

-	go cancel()
+		select {
+		case <-waitFinished:
+			assert.True(t, ts.isStopped(), "failed to stop")
+			assert.False(t, ts.IsRunning(), "is not running")
+
+		case <-time.After(100 * time.Millisecond):
+			t.Fatal("expected Wait() to finish within 100 ms.")
+		}
+	})
+	t.Run("ManualStop", func(t *testing.T) {
+		ts := &testService{}
+		ts.BaseService = *NewBaseService(logger, t.Name(), ts)
+		require.False(t, ts.IsRunning())
+		require.False(t, ts.isStarted())
+		require.NoError(t, ts.Start(ctx))
+
+		require.True(t, ts.isStarted())
+
+		require.NoError(t, ts.Stop())
+		require.True(t, ts.isStopped())
+		require.False(t, ts.IsRunning())
+	})
+	t.Run("MultiStop", func(t *testing.T) {
+		t.Run("SingleThreaded", func(t *testing.T) {
+			ts := &testService{}
+			ts.BaseService = *NewBaseService(logger, t.Name(), ts)
+
+			require.NoError(t, ts.Start(ctx))
+			require.True(t, ts.isStarted())
+			require.NoError(t, ts.Stop())
+			require.True(t, ts.isStopped())
+			require.False(t, ts.isMultiStopped())
+			require.Error(t, ts.Stop())
+			require.False(t, ts.isMultiStopped())
+		})
+		t.Run("MultiThreaded", func(t *testing.T) {
+			ctx, cancel := context.WithCancel(context.Background())
+			defer cancel()
+
+			ts := &testService{}
+			ts.BaseService = *NewBaseService(logger, t.Name(), ts)
+
+			require.NoError(t, ts.Start(ctx))
+			require.True(t, ts.isStarted())
+
+			go func() { _ = ts.Stop() }()
+			go cancel()
+
+			ts.Wait()
+
+			require.True(t, ts.isStopped())
+			require.False(t, ts.isMultiStopped())
+		})
+
+	})

-	select {
-	case <-waitFinished:
-		// all good
-	case <-time.After(100 * time.Millisecond):
-		t.Fatal("expected Wait() to finish within 100 ms.")
-	}
 }