test: fix various E2E test issues (#5576)

* Don't use state sync for nodes starting at initial height.
* Also remove stopped containers when cleaning up.
* Start nodes in order of startAt, mode, name to avoid full nodes starting before their seeds.
* Tweak network waiting to avoid halts caused by validator changes and perturbations.
* Disable most tests for seed nodes, which aren't always able to join consensus.
* Disable `blockchain/v2` due to known bugs.
2026-01-08 22:23:11 +00:00 · 2020-10-27 17:22:00 +01:00
parent 9d354c842e
commit 59f3f63d33
9 changed files with 88 additions and 21 deletions
--- a/test/e2e/generator/generate.go
+++ b/test/e2e/generator/generate.go
@@ -29,7 +29,10 @@ var (
 	nodeABCIProtocols    = uniformChoice{"unix", "tcp", "grpc", "builtin"}
 	nodePrivvalProtocols = uniformChoice{"file", "unix", "tcp"}
 	// FIXME v1 disabled due to https://github.com/tendermint/tendermint/issues/5444
-	nodeFastSyncs         = uniformChoice{"", "v0", "v2"} // "v1",
+	// FIXME v2 disabled due to:
+	// https://github.com/tendermint/tendermint/issues/5513
+	// https://github.com/tendermint/tendermint/issues/5541
+	nodeFastSyncs         = uniformChoice{"", "v0"} // "v1", "v2"
 	nodeStateSyncs        = uniformChoice{false, true}
 	nodePersistIntervals  = uniformChoice{0, 1, 5}
 	nodeSnapshotIntervals = uniformChoice{0, 3}
@@ -87,7 +90,8 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er

 	// First we generate seed nodes, starting at the initial height.
 	for i := 1; i <= numSeeds; i++ {
-		manifest.Nodes[fmt.Sprintf("seed%02d", i)] = generateNode(r, e2e.ModeSeed, 0, false)
+		manifest.Nodes[fmt.Sprintf("seed%02d", i)] = generateNode(
+			r, e2e.ModeSeed, 0, manifest.InitialHeight, false)
 	}

 	// Next, we generate validators. We make sure a BFT quorum of validators start
@@ -96,15 +100,16 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er
 	nextStartAt := manifest.InitialHeight + 5
 	quorum := numValidators*2/3 + 1
 	for i := 1; i <= numValidators; i++ {
-		startAt := manifest.InitialHeight
+		startAt := int64(0)
 		if i > quorum {
 			startAt = nextStartAt
 			nextStartAt += 5
 		}
 		name := fmt.Sprintf("validator%02d", i)
-		manifest.Nodes[name] = generateNode(r, e2e.ModeValidator, startAt, i <= 2)
+		manifest.Nodes[name] = generateNode(
+			r, e2e.ModeValidator, startAt, manifest.InitialHeight, i <= 2)

-		if startAt == manifest.InitialHeight {
+		if startAt == 0 {
 			(*manifest.Validators)[name] = int64(30 + r.Intn(71))
 		} else {
 			manifest.ValidatorUpdates[fmt.Sprint(startAt+5)] = map[string]int64{
@@ -130,7 +135,8 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er
 			startAt = nextStartAt
 			nextStartAt += 5
 		}
-		manifest.Nodes[fmt.Sprintf("full%02d", i)] = generateNode(r, e2e.ModeFull, startAt, false)
+		manifest.Nodes[fmt.Sprintf("full%02d", i)] = generateNode(
+			r, e2e.ModeFull, startAt, manifest.InitialHeight, false)
 	}

 	// We now set up peer discovery for nodes. Seed nodes are fully meshed with
@@ -180,7 +186,8 @@ func generateTestnet(r *rand.Rand, opt map[string]interface{}) (e2e.Manifest, er
 // here, since we need to know the overall network topology and startup
 // sequencing.
 func generateNode(
-	r *rand.Rand, mode e2e.Mode, startAt int64, forceArchive bool) *e2e.ManifestNode {
+	r *rand.Rand, mode e2e.Mode, startAt int64, initialHeight int64, forceArchive bool,
+) *e2e.ManifestNode {
 	node := e2e.ManifestNode{
 		Mode:             string(mode),
 		StartAt:          startAt,
@@ -203,8 +210,11 @@ func generateNode(
 	}

 	if node.Mode == "validator" {
-		node.Misbehaviors = nodeMisbehaviors.Choose(r).(misbehaviorOption).
-			atHeight(startAt + 5 + int64(r.Intn(10)))
+		misbehaveAt := startAt + 5 + int64(r.Intn(10))
+		if startAt == 0 {
+			misbehaveAt += initialHeight - 1
+		}
+		node.Misbehaviors = nodeMisbehaviors.Choose(r).(misbehaviorOption).atHeight(misbehaveAt)
 		if len(node.Misbehaviors) != 0 {
 			node.PrivvalProtocol = "file"
 		}
--- a/test/e2e/networks/ci.toml
+++ b/test/e2e/networks/ci.toml
@@ -1,4 +1,4 @@
-# This testnet is (will be) run by CI, and attempts to cover a broad range of
+# This testnet is run by CI, and attempts to cover a broad range of
 # functionality with a single network.

 initial_height = 1000
@@ -75,7 +75,7 @@ start_at = 1010
 mode = "full"
 # FIXME Should use v1, but it won't catch up since some nodes don't have all blocks
 # https://github.com/tendermint/tendermint/issues/5444
-fast_sync = "v2"
+fast_sync = "v0"
 persistent_peers = ["validator01", "validator02", "validator03", "validator04", "validator05"]
 retain_blocks = 1
 perturb = ["restart"]
@@ -83,7 +83,10 @@ perturb = ["restart"]
 [node.full02]
 start_at = 1015
 mode = "full"
-fast_sync = "v2"
+# FIXME Should use v2, but it has concurrency bugs causing panics or halts
+# https://github.com/tendermint/tendermint/issues/5513
+# https://github.com/tendermint/tendermint/issues/5541
+fast_sync = "v0"
 state_sync = true
 seeds = ["seed01"]
 perturb = ["restart"]
--- a/test/e2e/pkg/testnet.go
+++ b/test/e2e/pkg/testnet.go
@@ -403,6 +403,16 @@ func (t Testnet) IPv6() bool {
 	return t.IP.IP.To4() == nil
 }

+// HasPerturbations returns whether the network has any perturbations.
+func (t Testnet) HasPerturbations() bool {
+	for _, node := range t.Nodes {
+		if len(node.Perturbations) > 0 {
+			return true
+		}
+	}
+	return false
+}
+
 // LastMisbehaviorHeight returns the height of the last misbehavior.
 func (t Testnet) LastMisbehaviorHeight() int64 {
 	lastHeight := int64(0)
--- a/test/e2e/runner/cleanup.go
+++ b/test/e2e/runner/cleanup.go
@@ -32,7 +32,7 @@ func cleanupDocker() error {
 	xargsR := `$(if [[ $OSTYPE == "linux-gnu"* ]]; then echo -n "-r"; fi)`

 	err := exec("bash", "-c", fmt.Sprintf(
-		"docker container ls -q --filter label=e2e | xargs %v docker container rm -f", xargsR))
+		"docker container ls -qa --filter label=e2e | xargs %v docker container rm -f", xargsR))
 	if err != nil {
 		return err
 	}
--- a/test/e2e/runner/main.go
+++ b/test/e2e/runner/main.go
@@ -69,25 +69,33 @@ func NewCLI() *CLI {
 			if err := Start(cli.testnet); err != nil {
 				return err
 			}
+
 			if lastMisbehavior := cli.testnet.LastMisbehaviorHeight(); lastMisbehavior > 0 {
-				// wait for misbehaviors before starting perturbations
-				if err := WaitUntil(cli.testnet, lastMisbehavior+5); err != nil {
+				// wait for misbehaviors before starting perturbations. We do a separate
+				// wait for another 5 blocks, since the last misbehavior height may be
+				// in the past depending on network startup ordering.
+				if err := WaitUntil(cli.testnet, lastMisbehavior); err != nil {
 					return err
 				}
 			}
-			if err := Perturb(cli.testnet); err != nil {
-				return err
-			}
 			if err := Wait(cli.testnet, 5); err != nil { // allow some txs to go through
 				return err
 			}

+			if cli.testnet.HasPerturbations() {
+				if err := Perturb(cli.testnet); err != nil {
+					return err
+				}
+				if err := Wait(cli.testnet, 5); err != nil { // allow some txs to go through
+					return err
+				}
+			}
+
 			loadCancel()
 			if err := <-chLoadResult; err != nil {
 				return err
 			}
-			// wait for network to settle before tests
-			if err := Wait(cli.testnet, 5); err != nil {
+			if err := Wait(cli.testnet, 5); err != nil { // wait for network to settle before tests
 				return err
 			}
 			if err := Test(cli.testnet); err != nil {
--- a/test/e2e/runner/start.go
+++ b/test/e2e/runner/start.go
@@ -10,8 +10,21 @@ import (

 func Start(testnet *e2e.Testnet) error {

-	// Sort nodes by starting order
+	// Nodes are already sorted by name. Sort them by mode then startAt,
+	// which gives the overall order startAt, mode, name.
 	nodeQueue := testnet.Nodes
+	sort.SliceStable(nodeQueue, func(i, j int) bool {
+		a, b := nodeQueue[i], nodeQueue[j]
+		switch {
+		case a.Mode == b.Mode:
+			return false
+		case a.Mode == e2e.ModeSeed:
+			return true
+		case a.Mode == e2e.ModeValidator && b.Mode == e2e.ModeFull:
+			return true
+		}
+		return false
+	})
 	sort.SliceStable(nodeQueue, func(i, j int) bool {
 		return nodeQueue[i].StartAt < nodeQueue[j].StartAt
 	})
--- a/test/e2e/tests/app_test.go
+++ b/test/e2e/tests/app_test.go
@@ -16,6 +16,9 @@ import (
 // Tests that any initial state given in genesis has made it into the app.
 func TestApp_InitialState(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
+		if node.Mode == e2e.ModeSeed {
+			return
+		}
 		if len(node.Testnet.InitialState) == 0 {
 			return
 		}
@@ -35,6 +38,10 @@ func TestApp_InitialState(t *testing.T) {
 // block and the node sync status.
 func TestApp_Hash(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
+		if node.Mode == e2e.ModeSeed {
+			return
+		}
+
 		client, err := node.Client()
 		require.NoError(t, err)
 		info, err := client.ABCIInfo(ctx)
@@ -56,6 +63,10 @@ func TestApp_Hash(t *testing.T) {
 // Tests that we can set a value and retrieve it.
 func TestApp_Tx(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
+		if node.Mode == e2e.ModeSeed {
+			return
+		}
+
 		client, err := node.Client()
 		require.NoError(t, err)

--- a/test/e2e/tests/block_test.go
+++ b/test/e2e/tests/block_test.go
@@ -13,6 +13,10 @@ import (
 func TestBlock_Header(t *testing.T) {
 	blocks := fetchBlockChain(t)
 	testNode(t, func(t *testing.T, node e2e.Node) {
+		if node.Mode == e2e.ModeSeed {
+			return
+		}
+
 		client, err := node.Client()
 		require.NoError(t, err)
 		status, err := client.Status(ctx)
@@ -42,6 +46,10 @@ func TestBlock_Header(t *testing.T) {
 // Tests that the node contains the expected block range.
 func TestBlock_Range(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
+		if node.Mode == e2e.ModeSeed {
+			return
+		}
+
 		client, err := node.Client()
 		require.NoError(t, err)
 		status, err := client.Status(ctx)
--- a/test/e2e/tests/validator_test.go
+++ b/test/e2e/tests/validator_test.go
@@ -14,6 +14,10 @@ import (
 // scheduled validator updates.
 func TestValidator_Sets(t *testing.T) {
 	testNode(t, func(t *testing.T, node e2e.Node) {
+		if node.Mode == e2e.ModeSeed {
+			return
+		}
+
 		client, err := node.Client()
 		require.NoError(t, err)
 		status, err := client.Status(ctx)