test(s3/policy): stop racing t.TempDir cleanup against mini shutdown

The mini cluster's admin/plugin worker keeps creating files under
admin/plugin/job_types/ for ~1s after subtests finish, while the
previous Stop() only cancelled an unobserved context and slept 500ms.
t.TempDir()'s registered RemoveAll then raced the worker and
intermittently failed with "directory not empty" (CI run 25352039081).

Manage the data dir manually so it is removed only after the mini
goroutine has exited, and wire MiniClusterCtx so cancel actually drains
master/volume/filer/admin/s3/webdav.
This commit is contained in:
Chris Lu
2026-05-04 20:16:53 -07:00
parent 0a91b57f16
commit 6d95a5592a

View File

@@ -710,7 +710,14 @@ func startMiniCluster(t *testing.T) (*TestCluster, error) {
filerPort, filerGrpcPort := ports[4], ports[5]
s3Port, s3GrpcPort := ports[6], ports[7]
testDir := t.TempDir()
// Manually-managed temp dir (not t.TempDir()) so we control removal order:
// the dir is removed inside Stop() AFTER the mini goroutine has fully
// exited. Otherwise t.TempDir()'s RemoveAll cleanup races the admin
// plugin worker, which keeps creating files under admin/plugin/job_types/
// for ~1s after subtests finish, producing flaky "directory not empty"
// failures (see seaweedfs CI run 25352039081).
testDir, err := os.MkdirTemp("", "seaweed-policy-mini-")
require.NoError(t, err)
ctx, cancel := context.WithCancel(context.Background())
s3Endpoint := fmt.Sprintf("http://127.0.0.1:%d", s3Port)
@@ -729,7 +736,7 @@ func startMiniCluster(t *testing.T) (*TestCluster, error) {
// Disable authentication for tests
securityToml := filepath.Join(testDir, "security.toml")
err := os.WriteFile(securityToml, []byte("# Empty security config\n"), 0644)
err = os.WriteFile(securityToml, []byte("# Empty security config\n"), 0644)
require.NoError(t, err)
// Configure credential store for IAM tests.
@@ -783,7 +790,12 @@ enabled = true
for _, cmd := range command.Commands {
if cmd.Name() == "mini" && cmd.Run != nil {
cmd.Flag.Parse(os.Args[1:])
// MiniClusterCtx makes runMini observe our cancel: master/
// volume/filer get this as their shutdownCtx, and the clients'
// state (admin/s3/webdav/plugin worker) chains from it.
command.MiniClusterCtx = ctx
cmd.Run(cmd, cmd.Flag.Args())
command.MiniClusterCtx = nil
return
}
}
@@ -792,6 +804,8 @@ enabled = true
// Wait for S3
if !testutil.WaitForService(cluster.s3Endpoint, 60*time.Second) {
cancel()
cluster.wg.Wait()
os.RemoveAll(testDir)
return nil, fmt.Errorf("timeout waiting for S3 at %s", cluster.s3Endpoint)
}
@@ -799,6 +813,8 @@ enabled = true
if os.Getenv("VOLUME_SERVER_IMPL") == "rust" {
if err := cluster.startRustVolumeServer(t); err != nil {
cancel()
cluster.wg.Wait()
os.RemoveAll(testDir)
return nil, fmt.Errorf("failed to start Rust volume server: %v", err)
}
}
@@ -882,10 +898,25 @@ func (c *TestCluster) Stop() {
if c.cancel != nil {
c.cancel()
}
if c.isRunning {
time.Sleep(500 * time.Millisecond)
// Wait for the mini goroutine to fully return before removing the data
// dir. runMini observes MiniClusterCtx and returns once cancel fires; the
// admin/s3/webdav/plugin-worker shutdown is gated by the same ctx via
// resetMiniClients. The deadline is generous because admin shutdown can
// take several seconds when graceful stops are draining. If the deadline
// expires, Stop only prints a warning and carries on, so the data dir
// removal below can still race a goroutine that has not exited yet.
done := make(chan struct{})
go func() {
c.wg.Wait()
close(done)
}()
select {
case <-done:
case <-time.After(15 * time.Second):
fmt.Println("Warning: TestCluster.Stop timed out waiting for mini goroutine")
}
// Simplified stop
// Reset all mini flags so a subsequent in-process startMiniCluster sees
// fresh state.
for _, cmd := range command.Commands {
if cmd.Name() == "mini" {
cmd.Flag.VisitAll(func(f *flag.Flag) {
@@ -894,6 +925,10 @@ func (c *TestCluster) Stop() {
break
}
}
if c.dataDir != "" {
os.RemoveAll(c.dataDir)
}
}
func contains(s, substr string) bool {