Files
seaweedfs/test/volume_server/framework/cluster_multi.go
Chris Lu f5b833ab6a test(ec): end-to-end encode over a multi-server multi-disk stuck layout (#9728)
* test(framework): support multiple disks per server in MultiVolumeCluster

StartMultiVolumeClusterWithDisks gives each volume server N data
directories (one DiskLocation each), passed to -dir as a comma list, with
a per-server disk-dir accessor for file inspection. StartMultiVolumeCluster
keeps its one-disk default.

* test(ec): end-to-end encode over a multi-server multi-disk stuck layout

A volume in the stuck state — real .dat source, a 0-byte stub replica, and
partial stale EC shards from an interrupted encode — must converge to one
valid EC layout. Asserts the full shard set across servers, .ecx/.vif kept
per server (info file survives the source-volume delete), stale shards
cleared, and no regular .dat/.idx left behind.
2026-05-28 16:44:42 -07:00

349 lines
10 KiB
Go

package framework
import (
"fmt"
"net"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
"sync"
"testing"
"github.com/seaweedfs/seaweedfs/test/testutil"
"github.com/seaweedfs/seaweedfs/test/volume_server/matrix"
)
// MultiVolumeCluster is a test harness that runs one master process and one or
// more volume server processes from the weed binary. Each volume server is
// backed by one or more data directories.
type MultiVolumeCluster struct {
	testingTB testing.TB     // test handle; Stop consults Failed and logs through it
	profile   matrix.Profile // test profile; feeds the security config and volume server flags

	// Filesystem layout. configDir and logsDir live under baseDir.
	weedBinary, baseDir, configDir, logsDir string

	keepLogs bool // when set, Stop leaves baseDir on disk

	volumeServerCount, disksPerServer int

	masterPort, masterGrpcPort int

	// Per-server ports, indexed by volume server index.
	volumePorts, volumeGrpcPorts, volumePubPorts []int

	volumeDiskDirs [][]string // per server: the data directories (one DiskLocation each)

	masterCmd  *exec.Cmd
	volumeCmds []*exec.Cmd // indexed by volume server index; nil until that server has started

	cleanupOnce sync.Once // guards Stop so teardown runs a single time
}
// StartMultiVolumeCluster starts a cluster with serverCount volume servers, one
// data directory (DiskLocation) each. It is shorthand for
// StartMultiVolumeClusterWithDisks with disksPerServer set to 1.
func StartMultiVolumeCluster(t testing.TB, profile matrix.Profile, serverCount int) *MultiVolumeCluster {
	// Mark this wrapper as a helper so a startup failure is attributed to the
	// calling test instead of to the line below.
	t.Helper()
	return StartMultiVolumeClusterWithDisks(t, profile, serverCount, 1)
}
// StartMultiVolumeClusterWithDisks starts serverCount volume servers, each with
// disksPerServer data directories passed to -dir as a comma list so every
// directory becomes its own DiskLocation. Lets tests exercise per-disk EC
// layouts (e.g. shards on one disk, a stub/index on a sibling disk).
//
// The master is started first and waited on via /dir/status. Volume servers
// are then started one at a time, each waited on via its /status endpoint and
// its gRPC address. Any failure stops whatever was started and fails the test
// with t.Fatalf. On success, Stop is registered with t.Cleanup.
func StartMultiVolumeClusterWithDisks(t testing.TB, profile matrix.Profile, serverCount, disksPerServer int) *MultiVolumeCluster {
	t.Helper()

	if serverCount < 1 {
		t.Fatalf("serverCount must be at least 1, got %d", serverCount)
	}
	if disksPerServer < 1 {
		t.Fatalf("disksPerServer must be at least 1, got %d", disksPerServer)
	}

	weedBinary, err := FindOrBuildWeedBinary()
	if err != nil {
		t.Fatalf("resolve weed binary: %v", err)
	}

	baseDir, keepLogs, err := newWorkDir()
	if err != nil {
		t.Fatalf("create temp test directory: %v", err)
	}

	configDir := filepath.Join(baseDir, "config")
	logsDir := filepath.Join(baseDir, "logs")
	masterDataDir := filepath.Join(baseDir, "master")

	// Create directories for master and all volume servers. With one disk the
	// layout stays at baseDir/volumeN; multi-disk servers get baseDir/volumeN/diskD.
	dirs := []string{configDir, logsDir, masterDataDir}
	volumeDiskDirs := make([][]string, serverCount)
	for i := 0; i < serverCount; i++ {
		serverDir := filepath.Join(baseDir, fmt.Sprintf("volume%d", i))
		volumeDiskDirs[i] = make([]string, disksPerServer)
		for d := 0; d < disksPerServer; d++ {
			if disksPerServer == 1 {
				volumeDiskDirs[i][d] = serverDir
			} else {
				volumeDiskDirs[i][d] = filepath.Join(serverDir, fmt.Sprintf("disk%d", d))
			}
			dirs = append(dirs, volumeDiskDirs[i][d])
		}
	}
	for _, dir := range dirs {
		if mkErr := os.MkdirAll(dir, 0o755); mkErr != nil {
			t.Fatalf("create %s: %v", dir, mkErr)
		}
	}

	if err = writeSecurityConfig(configDir, profile); err != nil {
		t.Fatalf("write security config: %v", err)
	}

	// Allocate ports for all volume servers (3 ports per server: admin, grpc, public)
	// If SplitPublicPort is true, we need an additional port per server
	portsPerServer := 3
	if profile.SplitPublicPort {
		portsPerServer = 4
	}
	totalPorts := serverCount * portsPerServer
	miniPorts, ports, err := testutil.AllocatePortSet(1, totalPorts)
	if err != nil {
		t.Fatalf("allocate ports: %v", err)
	}

	masterPort := miniPorts[0]
	masterGrpcPort := masterPort + testutil.GrpcPortOffset

	c := &MultiVolumeCluster{
		testingTB:         t,
		profile:           profile,
		weedBinary:        weedBinary,
		baseDir:           baseDir,
		configDir:         configDir,
		logsDir:           logsDir,
		keepLogs:          keepLogs,
		volumeServerCount: serverCount,
		disksPerServer:    disksPerServer,
		masterPort:        masterPort,
		masterGrpcPort:    masterGrpcPort,
		volumePorts:       make([]int, serverCount),
		volumeGrpcPorts:   make([]int, serverCount),
		volumePubPorts:    make([]int, serverCount),
		volumeDiskDirs:    volumeDiskDirs,
		volumeCmds:        make([]*exec.Cmd, serverCount),
	}

	// Assign ports to each volume server. Within a server's slice of ports:
	// slot 0 is admin, slot 1 is grpc, and the public port is slot 2, or
	// slot 3 when SplitPublicPort is set (slot 2 is then left unused).
	for i := 0; i < serverCount; i++ {
		baseIdx := i * portsPerServer
		c.volumePorts[i] = ports[baseIdx]
		c.volumeGrpcPorts[i] = ports[baseIdx+1]

		pubPortIdx := baseIdx + 2
		if profile.SplitPublicPort {
			pubPortIdx = baseIdx + 3
		}
		c.volumePubPorts[i] = ports[pubPortIdx]
	}

	// Start master
	if err = c.startMaster(masterDataDir); err != nil {
		c.Stop()
		t.Fatalf("start master: %v", err)
	}
	if err = c.waitForHTTP(c.MasterURL() + "/dir/status"); err != nil {
		masterLog := c.tailLog("master.log")
		c.Stop()
		t.Fatalf("wait for master readiness: %v\nmaster log tail:\n%s", err, masterLog)
	}

	// Start all volume servers.
	//
	// On every failure path the log tail is read BEFORE Stop is called: Stop
	// removes baseDir (which contains logsDir) unless keepLogs is set or the
	// test has already failed, and the test is not marked failed until Fatalf
	// runs. Reading the tail after Stop would find the log already deleted.
	for i := 0; i < serverCount; i++ {
		volumeLogName := fmt.Sprintf("volume%d.log", i)

		if err = c.startVolume(i, c.volumeDiskDirs[i]); err != nil {
			volumeLog := c.tailLog(volumeLogName)
			c.Stop()
			t.Fatalf("start volume server %d: %v\nvolume log tail:\n%s", i, err, volumeLog)
		}
		if err = c.waitForHTTP(c.VolumeAdminURL(i) + "/status"); err != nil {
			volumeLog := c.tailLog(volumeLogName)
			c.Stop()
			t.Fatalf("wait for volume server %d readiness: %v\nvolume log tail:\n%s", i, err, volumeLog)
		}
		if err = c.waitForTCP(c.VolumeGRPCAddress(i)); err != nil {
			volumeLog := c.tailLog(volumeLogName)
			c.Stop()
			t.Fatalf("wait for volume server %d grpc readiness: %v\nvolume log tail:\n%s", i, err, volumeLog)
		}
	}

	t.Cleanup(func() {
		c.Stop()
	})

	return c
}
// StartTripleVolumeCluster is a convenience wrapper that starts a cluster with
// 3 volume servers, one data directory each.
func StartTripleVolumeCluster(t testing.TB, profile matrix.Profile) *MultiVolumeCluster {
	// Mark this wrapper as a helper so a startup failure is attributed to the
	// calling test instead of to the line below.
	t.Helper()
	return StartMultiVolumeCluster(t, profile, 3)
}
// Stop shuts down the volume servers and then the master, and afterwards
// either deletes the working directory or reports where it was kept. It is
// safe to call on a nil cluster and safe to call repeatedly; only the first
// call does any work.
func (c *MultiVolumeCluster) Stop() {
	if c == nil {
		return
	}
	c.cleanupOnce.Do(func() {
		// Volume servers go down last-started-first, then the master.
		for remaining := len(c.volumeCmds); remaining > 0; remaining-- {
			stopProcess(c.volumeCmds[remaining-1])
		}
		stopProcess(c.masterCmd)

		// Keep the working directory when asked to, or when the test failed.
		preserve := c.keepLogs || c.testingTB.Failed()
		if !preserve {
			_ = os.RemoveAll(c.baseDir)
			return
		}
		if c.baseDir != "" {
			c.testingTB.Logf("volume server integration logs kept at %s", c.baseDir)
		}
	})
}
// startMaster launches the master process with its stdout and stderr written
// to logsDir/master.log, storing its metadata under dataDir. It returns once
// the process has been started; it does not wait for the master to become
// ready. c.masterCmd is set only when the process started successfully.
func (c *MultiVolumeCluster) startMaster(dataDir string) error {
	logFile, err := os.Create(filepath.Join(c.logsDir, "master.log"))
	if err != nil {
		return err
	}

	args := []string{
		"-config_dir=" + c.configDir,
		"master",
		"-ip=127.0.0.1",
		"-port=" + strconv.Itoa(c.masterPort),
		"-port.grpc=" + strconv.Itoa(c.masterGrpcPort),
		"-mdir=" + dataDir,
		"-peers=none",
		"-volumeSizeLimitMB=" + strconv.Itoa(testVolumeSizeLimitMB),
		"-defaultReplication=000",
	}

	cmd := exec.Command(c.weedBinary, args...)
	cmd.Dir = c.baseDir
	cmd.Stdout = logFile
	cmd.Stderr = logFile
	if err = cmd.Start(); err != nil {
		// No child process took over the log file, so release the handle
		// here instead of leaking it.
		_ = logFile.Close()
		return err
	}

	// Record the command only once it is running, matching startVolume, so
	// Stop never sees a command that was never started.
	c.masterCmd = cmd
	return nil
}
// startVolume launches volume server number index with the given data
// directories, writing its stdout and stderr to logsDir/volume<index>.log.
// Every directory in dataDirs is passed to -dir and gets a matching -max
// entry of 16. It returns once the process has been started; it does not wait
// for the server to become ready. c.volumeCmds[index] is set only when the
// process started successfully.
func (c *MultiVolumeCluster) startVolume(index int, dataDirs []string) error {
	logName := fmt.Sprintf("volume%d.log", index)
	logFile, err := os.Create(filepath.Join(c.logsDir, logName))
	if err != nil {
		return err
	}

	// -max takes one value per -dir entry.
	maxPerDir := make([]string, len(dataDirs))
	for i := range dataDirs {
		maxPerDir[i] = "16"
	}

	args := []string{
		"-config_dir=" + c.configDir,
		"volume",
		"-ip=127.0.0.1",
		"-port=" + strconv.Itoa(c.volumePorts[index]),
		"-port.grpc=" + strconv.Itoa(c.volumeGrpcPorts[index]),
		"-port.public=" + strconv.Itoa(c.volumePubPorts[index]),
		"-dir=" + strings.Join(dataDirs, ","),
		"-max=" + strings.Join(maxPerDir, ","),
		"-master=127.0.0.1:" + strconv.Itoa(c.masterPort),
		"-readMode=" + c.profile.ReadMode,
		"-concurrentUploadLimitMB=" + strconv.Itoa(c.profile.ConcurrentUploadLimitMB),
		"-concurrentDownloadLimitMB=" + strconv.Itoa(c.profile.ConcurrentDownloadLimitMB),
		// Integration tests deliberately exercise loopback S3 endpoints; allow the SSRF guard to be bypassed for them.
		"-volume.allowUntrustedRemoteEndpoints",
	}
	// The in-flight timeouts are optional: only pass them when the profile sets one.
	if c.profile.InflightUploadTimeout > 0 {
		args = append(args, "-inflightUploadDataTimeout="+c.profile.InflightUploadTimeout.String())
	}
	if c.profile.InflightDownloadTimeout > 0 {
		args = append(args, "-inflightDownloadDataTimeout="+c.profile.InflightDownloadTimeout.String())
	}

	cmd := exec.Command(c.weedBinary, args...)
	cmd.Dir = c.baseDir
	cmd.Stdout = logFile
	cmd.Stderr = logFile

	if err = cmd.Start(); err != nil {
		// No child process took over the log file, so release the handle
		// here instead of leaking it.
		_ = logFile.Close()
		return err
	}
	c.volumeCmds[index] = cmd
	return nil
}
// waitForHTTP reuses Cluster.waitForHTTP for url by calling it on a
// zero-value Cluster.
func (c *MultiVolumeCluster) waitForHTTP(url string) error {
	delegate := &Cluster{}
	return delegate.waitForHTTP(url)
}
// waitForTCP reuses Cluster.waitForTCP for addr by calling it on a
// zero-value Cluster.
func (c *MultiVolumeCluster) waitForTCP(addr string) error {
	delegate := &Cluster{}
	return delegate.waitForTCP(addr)
}
// tailLog reuses Cluster.tailLog for logName, calling it on a Cluster that
// carries only this cluster's logsDir.
func (c *MultiVolumeCluster) tailLog(logName string) string {
	delegate := &Cluster{logsDir: c.logsDir}
	return delegate.tailLog(logName)
}
// MasterAddress returns the master's HTTP address as "127.0.0.1:<port>".
func (c *MultiVolumeCluster) MasterAddress() string {
	port := strconv.Itoa(c.masterPort)
	return net.JoinHostPort("127.0.0.1", port)
}
// MasterURL returns the master's HTTP address prefixed with "http://".
func (c *MultiVolumeCluster) MasterURL() string {
	address := c.MasterAddress()
	return "http://" + address
}
// VolumeAdminAddress returns "127.0.0.1:<port>" for the admin port of volume
// server index, or "" when index is out of range.
func (c *MultiVolumeCluster) VolumeAdminAddress(index int) string {
	if index >= 0 && index < len(c.volumePorts) {
		port := strconv.Itoa(c.volumePorts[index])
		return net.JoinHostPort("127.0.0.1", port)
	}
	return ""
}
// VolumePublicAddress returns "127.0.0.1:<port>" for the public port of volume
// server index, or "" when index is out of range.
func (c *MultiVolumeCluster) VolumePublicAddress(index int) string {
	if index >= 0 && index < len(c.volumePubPorts) {
		port := strconv.Itoa(c.volumePubPorts[index])
		return net.JoinHostPort("127.0.0.1", port)
	}
	return ""
}
// VolumeGRPCAddress returns "127.0.0.1:<port>" for the gRPC port of volume
// server index, or "" when index is out of range.
func (c *MultiVolumeCluster) VolumeGRPCAddress(index int) string {
	if index >= 0 && index < len(c.volumeGrpcPorts) {
		port := strconv.Itoa(c.volumeGrpcPorts[index])
		return net.JoinHostPort("127.0.0.1", port)
	}
	return ""
}
// VolumeAdminURL returns the admin address of volume server index prefixed
// with "http://". No range check is done here: the result is whatever
// VolumeAdminAddress yields for index, with the scheme prepended.
func (c *MultiVolumeCluster) VolumeAdminURL(index int) string {
	address := c.VolumeAdminAddress(index)
	return "http://" + address
}
// VolumePublicURL returns the public address of volume server index prefixed
// with "http://". No range check is done here: the result is whatever
// VolumePublicAddress yields for index, with the scheme prepended.
func (c *MultiVolumeCluster) VolumePublicURL(index int) string {
	address := c.VolumePublicAddress(index)
	return "http://" + address
}
// BaseDir returns the cluster's root working directory, under which the
// config, logs, master and volume data directories are created.
func (c *MultiVolumeCluster) BaseDir() string {
	return c.baseDir
}
// VolumeDiskDir returns the data directory backing disk diskIndex on volume
// server serverIndex, i.e. the directory for that DiskLocation, for file
// inspection in tests. It returns "" when either index is out of range.
func (c *MultiVolumeCluster) VolumeDiskDir(serverIndex, diskIndex int) string {
	if serverIndex >= 0 && serverIndex < len(c.volumeDiskDirs) {
		if dirs := c.volumeDiskDirs[serverIndex]; diskIndex >= 0 && diskIndex < len(dirs) {
			return dirs[diskIndex]
		}
	}
	return ""
}