mirror of
https://github.com/tendermint/tendermint.git
synced 2026-01-09 06:33:16 +00:00
blocksync: fix shutdown deadlock issue (#7030)
When shutting down blocksync, it is observed that the process can hang completely. A dump of running goroutines reveals that this is due to goroutines not listening on the correct shutdown signal. Namely, the `poolRoutine` goroutine does not wait on `pool.Quit`. The `poolRoutine` does not receive any other shutdown signal during `OnStop` becuase it must stop before the `r.closeCh` is closed. Currently the `poolRoutine` listens in the `closeCh` which will not close until the `poolRoutine` stops and calls `poolWG.Done()`.
This change also puts the `requestRoutine()` in the `OnStart` method to make it more visible since it does not rely on anything that is spawned in the `poolRoutine`.
```
goroutine 183 [semacquire]:
sync.runtime_Semacquire(0xc0000d3bd8)
runtime/sema.go:56 +0x45
sync.(*WaitGroup).Wait(0xc0000d3bd0)
sync/waitgroup.go:130 +0x65
github.com/tendermint/tendermint/internal/blocksync/v0.(*Reactor).OnStop(0xc0000d3a00)
github.com/tendermint/tendermint/internal/blocksync/v0/reactor.go:193 +0x47
github.com/tendermint/tendermint/libs/service.(*BaseService).Stop(0xc0000d3a00, 0x0, 0x0)
github.com/tendermint/tendermint/libs/service/service.go:171 +0x323
github.com/tendermint/tendermint/node.(*nodeImpl).OnStop(0xc00052c000)
github.com/tendermint/tendermint/node/node.go:758 +0xc62
github.com/tendermint/tendermint/libs/service.(*BaseService).Stop(0xc00052c000, 0x0, 0x0)
github.com/tendermint/tendermint/libs/service/service.go:171 +0x323
github.com/tendermint/tendermint/cmd/tendermint/commands.NewRunNodeCmd.func1.1()
github.com/tendermint/tendermint/cmd/tendermint/commands/run_node.go:143 +0x62
github.com/tendermint/tendermint/libs/os.TrapSignal.func1(0xc000df6d20, 0x7f04a68da900, 0xc0004a8930, 0xc0005a72d8)
github.com/tendermint/tendermint/libs/os/os.go:26 +0x102
created by github.com/tendermint/tendermint/libs/os.TrapSignal
github.com/tendermint/tendermint/libs/os/os.go:22 +0xe6
goroutine 161 [select]:
github.com/tendermint/tendermint/internal/blocksync/v0.(*Reactor).poolRoutine(0xc0000d3a00, 0x0)
github.com/tendermint/tendermint/internal/blocksync/v0/reactor.go:464 +0x2b3
created by github.com/tendermint/tendermint/internal/blocksync/v0.(*Reactor).OnStart
github.com/tendermint/tendermint/internal/blocksync/v0/reactor.go:174 +0xf1
goroutine 162 [select]:
github.com/tendermint/tendermint/internal/blocksync/v0.(*Reactor).processBlockSyncCh(0xc0000d3a00)
github.com/tendermint/tendermint/internal/blocksync/v0/reactor.go:310 +0x151
created by github.com/tendermint/tendermint/internal/blocksync/v0.(*Reactor).OnStart
github.com/tendermint/tendermint/internal/blocksync/v0/reactor.go:177 +0x54
goroutine 163 [select]:
github.com/tendermint/tendermint/internal/blocksync/v0.(*Reactor).processPeerUpdates(0xc0000d3a00)
github.com/tendermint/tendermint/internal/blocksync/v0/reactor.go:363 +0x12b
created by github.com/tendermint/tendermint/internal/blocksync/v0.(*Reactor).OnStart
github.com/tendermint/tendermint/internal/blocksync/v0/reactor.go:178 +0x76
```
This commit is contained in:
@@ -169,6 +169,8 @@ func (r *Reactor) OnStart() error {
|
||||
if err := r.pool.Start(); err != nil {
|
||||
return err
|
||||
}
|
||||
r.poolWG.Add(1)
|
||||
go r.requestRoutine()
|
||||
|
||||
r.poolWG.Add(1)
|
||||
go r.poolRoutine(false)
|
||||
@@ -384,6 +386,9 @@ func (r *Reactor) SwitchToBlockSync(state sm.State) error {
|
||||
|
||||
r.syncStartTime = time.Now()
|
||||
|
||||
r.poolWG.Add(1)
|
||||
go r.requestRoutine()
|
||||
|
||||
r.poolWG.Add(1)
|
||||
go r.poolRoutine(true)
|
||||
|
||||
@@ -394,7 +399,6 @@ func (r *Reactor) requestRoutine() {
|
||||
statusUpdateTicker := time.NewTicker(statusUpdateIntervalSeconds * time.Second)
|
||||
defer statusUpdateTicker.Stop()
|
||||
|
||||
r.poolWG.Add(1)
|
||||
defer r.poolWG.Done()
|
||||
|
||||
for {
|
||||
@@ -455,8 +459,6 @@ func (r *Reactor) poolRoutine(stateSynced bool) {
|
||||
defer trySyncTicker.Stop()
|
||||
defer switchToConsensusTicker.Stop()
|
||||
|
||||
go r.requestRoutine()
|
||||
|
||||
defer r.poolWG.Done()
|
||||
|
||||
FOR_LOOP:
|
||||
@@ -605,6 +607,8 @@ FOR_LOOP:
|
||||
|
||||
case <-r.closeCh:
|
||||
break FOR_LOOP
|
||||
case <-r.pool.Quit():
|
||||
break FOR_LOOP
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user