libs/service: regularize Stop semantics and concurrency primitives (#7809)

This commit is contained in:
Sam Kleinman
2022-02-14 08:28:29 -05:00
committed by GitHub
parent 73f605af3f
commit 824960c565
9 changed files with 226 additions and 116 deletions

View File

@@ -3,7 +3,7 @@ package service
import (
"context"
"errors"
"sync/atomic"
"sync"
"github.com/tendermint/tendermint/libs/log"
)
@@ -30,9 +30,6 @@ type Service interface {
// Return true if the service is running
IsRunning() bool
// String representation of the service
String() string
// Wait blocks until the service is stopped.
Wait()
}
@@ -40,8 +37,6 @@ type Service interface {
// Implementation describes the implementation that the
// BaseService implementation wraps.
type Implementation interface {
Service
// Called by the Service's Start method.
OnStart(context.Context) error
@@ -57,12 +52,7 @@ Users can override the OnStart/OnStop methods. In the absence of errors, these
methods are guaranteed to be called at most once. If OnStart returns an error,
service won't be marked as started, so the user can call Start again.
A call to Reset will panic, unless OnReset is overwritten, allowing
OnStart/OnStop to be called again.
The caller must ensure that Start and Stop are not called concurrently.
It is ok to call Stop without calling Start first.
It is safe, but an error, to call Stop without calling Start first.
Typical usage:
@@ -80,23 +70,21 @@ Typical usage:
}
func (fs *FooService) OnStart(ctx context.Context) error {
fs.BaseService.OnStart() // Always call the overridden method.
// initialize private fields
// start subroutines, etc.
}
func (fs *FooService) OnStop() error {
fs.BaseService.OnStop() // Always call the overridden method.
// close/destroy private fields
// stop subroutines, etc.
}
*/
type BaseService struct {
logger log.Logger
name string
started uint32 // atomic
stopped uint32 // atomic
quit chan struct{}
logger log.Logger
name string
mtx sync.Mutex
quit <-chan (struct{})
cancel context.CancelFunc
// The "subclass" of BaseService
impl Implementation
@@ -107,7 +95,6 @@ func NewBaseService(logger log.Logger, name string, impl Implementation) *BaseSe
return &BaseService{
logger: logger,
name: name,
quit: make(chan struct{}),
impl: impl,
}
}
@@ -116,83 +103,101 @@ func NewBaseService(logger log.Logger, name string, impl Implementation) *BaseSe
// returned if the service is already running or stopped. To restart a
// stopped service, call Reset.
func (bs *BaseService) Start(ctx context.Context) error {
if atomic.CompareAndSwapUint32(&bs.started, 0, 1) {
if atomic.LoadUint32(&bs.stopped) == 1 {
bs.logger.Error("not starting service; already stopped", "service", bs.name, "impl", bs.impl.String())
atomic.StoreUint32(&bs.started, 0)
return ErrAlreadyStopped
}
bs.mtx.Lock()
defer bs.mtx.Unlock()
bs.logger.Info("starting service", "service", bs.name, "impl", bs.impl.String())
if bs.quit != nil {
return ErrAlreadyStarted
}
select {
case <-bs.quit:
return ErrAlreadyStopped
default:
bs.logger.Info("starting service", "service", bs.name, "impl", bs.name)
if err := bs.impl.OnStart(ctx); err != nil {
// revert flag
atomic.StoreUint32(&bs.started, 0)
return err
}
// we need a separate context to ensure that we start
// a thread that will get cleaned up and that the
// Stop/Wait functions work as expected.
srvCtx, cancel := context.WithCancel(context.Background())
bs.cancel = cancel
bs.quit = srvCtx.Done()
go func(ctx context.Context) {
select {
case <-bs.quit:
// someone else explicitly called stop
// and then we shouldn't.
case <-srvCtx.Done():
// this means stop was called manually
return
case <-ctx.Done():
// if nothing is running, no need to
// shut down again.
if !bs.impl.IsRunning() {
return
}
// the context was canceled and we
// should stop.
if err := bs.Stop(); err != nil {
bs.logger.Error("stopped service",
"err", err.Error(),
"service", bs.name,
"impl", bs.impl.String())
}
bs.logger.Info("stopped service",
"service", bs.name,
"impl", bs.impl.String())
_ = bs.Stop()
}
bs.logger.Info("stopped service",
"service", bs.name)
}(ctx)
return nil
}
return ErrAlreadyStarted
}
// Stop implements Service by calling OnStop (if defined) and closing quit
// channel. An error will be returned if the service is already stopped.
func (bs *BaseService) Stop() error {
if atomic.CompareAndSwapUint32(&bs.stopped, 0, 1) {
if atomic.LoadUint32(&bs.started) == 0 {
bs.logger.Error("not stopping service; not started yet", "service", bs.name, "impl", bs.impl.String())
atomic.StoreUint32(&bs.stopped, 0)
return ErrNotStarted
}
bs.mtx.Lock()
defer bs.mtx.Unlock()
bs.logger.Info("stopping service", "service", bs.name, "impl", bs.impl.String())
if bs.quit == nil {
return ErrNotStarted
}
select {
case <-bs.quit:
return ErrAlreadyStopped
default:
bs.logger.Info("stopping service", "service", bs.name)
bs.impl.OnStop()
close(bs.quit)
bs.cancel()
return nil
}
return ErrAlreadyStopped
}
// IsRunning implements Service by returning true or false depending on the
// service's state.
func (bs *BaseService) IsRunning() bool {
return atomic.LoadUint32(&bs.started) == 1 && atomic.LoadUint32(&bs.stopped) == 0
bs.mtx.Lock()
defer bs.mtx.Unlock()
if bs.quit == nil {
return false
}
select {
case <-bs.quit:
return false
default:
return true
}
}
// getWait returns the channel a caller can receive from to wait for the
// service. While holding bs.mtx it reads bs.quit: a non-nil quit channel is
// returned as-is; otherwise a new, already-closed channel is returned, so a
// receive on the result completes immediately.
func (bs *BaseService) getWait() <-chan struct{} {
	bs.mtx.Lock()
	defer bs.mtx.Unlock()

	if quit := bs.quit; quit != nil {
		return quit
	}

	// No quit channel has been set: hand back a closed channel so that
	// receivers are never blocked.
	closed := make(chan struct{})
	close(closed)
	return closed
}
// Wait blocks until the service is stopped.
func (bs *BaseService) Wait() { <-bs.quit }
func (bs *BaseService) Wait() { <-bs.getWait() }
// String implements Service; the string representation of a BaseService is
// the name it holds.
func (bs *BaseService) String() string {
	return bs.name
}

View File

@@ -2,45 +2,135 @@ package service
import (
"context"
"sync"
"testing"
"time"
"github.com/fortytw2/leaktest"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/tendermint/tendermint/libs/log"
)
// testService is a Service implementation used by the tests in this file. It
// embeds BaseService and records which lifecycle callbacks have been invoked.
type testService struct {
	// Flags set by OnStart and OnStop; multiStopped is set when OnStop runs
	// while stopped is already true.
	started, stopped, multiStopped bool

	// mu is held by the methods of testService while they touch the flags.
	mu sync.Mutex

	BaseService
}
func (testService) OnStop() {}
func (testService) OnStart(context.Context) error {
// OnStop records that the service was stopped. If it is invoked when the
// stopped flag is already set, multiStopped is set as well, so tests can
// detect that OnStop ran more than once. The flags are updated under t.mu.
func (t *testService) OnStop() {
	t.mu.Lock()
	defer t.mu.Unlock()

	// stopped is already a bool; no comparison against true is needed.
	if t.stopped {
		t.multiStopped = true
	}
	t.stopped = true
}
// OnStart marks the service as started while holding t.mu. The context
// argument is not used, and the returned error is always nil.
func (t *testService) OnStart(_ context.Context) error {
	t.mu.Lock()
	t.started = true
	t.mu.Unlock()

	return nil
}
func TestBaseServiceWait(t *testing.T) {
// isStarted reports the current value of the started flag, read under t.mu.
func (t *testService) isStarted() bool {
	t.mu.Lock()
	started := t.started
	t.mu.Unlock()

	return started
}
// isStopped reports the current value of the stopped flag, read under t.mu.
func (t *testService) isStopped() bool {
	t.mu.Lock()
	stopped := t.stopped
	t.mu.Unlock()

	return stopped
}
// isMultiStopped reports the current value of the multiStopped flag, read
// under t.mu.
func (t *testService) isMultiStopped() bool {
	t.mu.Lock()
	multi := t.multiStopped
	t.mu.Unlock()

	return multi
}
func TestBaseService(t *testing.T) {
t.Cleanup(leaktest.Check(t))
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
logger := log.NewTestingLogger(t)
logger := log.NewNopLogger()
ts := &testService{}
ts.BaseService = *NewBaseService(logger, "TestService", ts)
err := ts.Start(ctx)
require.NoError(t, err)
t.Run("Wait", func(t *testing.T) {
wctx, wcancel := context.WithCancel(ctx)
defer wcancel()
ts := &testService{}
ts.BaseService = *NewBaseService(logger, t.Name(), ts)
err := ts.Start(wctx)
require.NoError(t, err)
require.True(t, ts.isStarted())
waitFinished := make(chan struct{})
go func() {
ts.Wait()
waitFinished <- struct{}{}
}()
waitFinished := make(chan struct{})
wcancel()
go func() {
ts.Wait()
close(waitFinished)
}()
go cancel()
select {
case <-waitFinished:
assert.True(t, ts.isStopped(), "failed to stop")
assert.False(t, ts.IsRunning(), "is not running")
case <-time.After(100 * time.Millisecond):
t.Fatal("expected Wait() to finish within 100 ms.")
}
})
t.Run("ManualStop", func(t *testing.T) {
ts := &testService{}
ts.BaseService = *NewBaseService(logger, t.Name(), ts)
require.False(t, ts.IsRunning())
require.False(t, ts.isStarted())
require.NoError(t, ts.Start(ctx))
require.True(t, ts.isStarted())
require.NoError(t, ts.Stop())
require.True(t, ts.isStopped())
require.False(t, ts.IsRunning())
})
t.Run("MultiStop", func(t *testing.T) {
t.Run("SingleThreaded", func(t *testing.T) {
ts := &testService{}
ts.BaseService = *NewBaseService(logger, t.Name(), ts)
require.NoError(t, ts.Start(ctx))
require.True(t, ts.isStarted())
require.NoError(t, ts.Stop())
require.True(t, ts.isStopped())
require.False(t, ts.isMultiStopped())
require.Error(t, ts.Stop())
require.False(t, ts.isMultiStopped())
})
t.Run("MultiThreaded", func(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
ts := &testService{}
ts.BaseService = *NewBaseService(logger, t.Name(), ts)
require.NoError(t, ts.Start(ctx))
require.True(t, ts.isStarted())
go func() { _ = ts.Stop() }()
go cancel()
ts.Wait()
require.True(t, ts.isStopped())
require.False(t, ts.isMultiStopped())
})
})
select {
case <-waitFinished:
// all good
case <-time.After(100 * time.Millisecond):
t.Fatal("expected Wait() to finish within 100 ms.")
}
}