Files
seaweedfs/weed/shell/command_volume_balance.go
Chris Lu e332b97d52 fix(shell): volume.balance no longer drains all volumes onto one server (#9579)
* fix(shell): volume.balance no longer drains all volumes onto one server

The density-based capacity function reads per-disk VolumeInfos sizes, but
adjustAfterMove only updated VolumeCount and the selectedVolumes map. The
planner re-read a stale topology after every move, so the source node's
density never dropped and it kept moving volumes until that node was empty.

Move the volume's size accounting between disks after each planned move so the
density recomputes and the loop converges to an even distribution.

* refactor(shell): O(1) volume removal and direct disk lookup in adjustAfterMove

removeVolumeInfo swaps with the last element instead of shifting, and the disk
is fetched by key rather than ranging the DiskInfos map.
2026-05-20 01:39:23 -07:00

608 lines
20 KiB
Go

package shell
import (
"cmp"
"flag"
"fmt"
"github.com/seaweedfs/seaweedfs/weed/util"
"io"
"os"
"regexp"
"strings"
"time"
"slices"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/storage/erasure_coding"
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
)
const (
// thresholdVolumeSize is the factor selectVolumesByActive applies to a
// volume's size before comparing it with the volume size limit.
thresholdVolumeSize = 1.01
// countZeroSelectedVolumes is the numerator localVolumeDensityRatio uses
// instead of zero when the capacity function reports no used volumes.
countZeroSelectedVolumes = 0.5
)
func init() {
Commands = append(Commands, &commandVolumeBalance{})
}
// commandVolumeBalance implements the "volume.balance" shell command and holds
// the settings Do derives from the command line and the cluster topology.
type commandVolumeBalance struct {
// volumeSizeLimitMb is the volume size limit returned by collectTopologyInfo.
volumeSizeLimitMb uint64
// commandEnv is the shell environment passed to Do.
commandEnv *CommandEnv
// volumeByActive is the -volumeBy filter: nil for ALL, true for ACTIVE,
// false for FULL.
volumeByActive *bool
// applyBalancing is the value of the -apply flag; moveVolume only performs
// the move when it is true.
applyBalancing bool
}
// Name returns the name of the command; Do also uses it to name its flag set.
func (c *commandVolumeBalance) Name() string {
	const commandName = "volume.balance"
	return commandName
}
// Help returns the usage text for volume.balance: the command synopsis, the
// accepted forms of the -collection parameter, and a pseudocode outline of the
// balancing algorithm.
//
// NOTE(review): the pseudocode describes B as the server with the highest
// ratio yet says "move writable volume v from A to B" — confirm the intended
// direction before relying on this text.
func (c *commandVolumeBalance) Help() string {
return `balance all volumes among volume servers
volume.balance [-collection ALL_COLLECTIONS|EACH_COLLECTION|<collection_name>] [-apply] [-dataCenter=<data_center_name>] [-racks=rack_name_one,rack_name_two] [-nodes=192.168.0.1:8080,192.168.0.2:8080]
The -collection parameter supports:
- ALL_COLLECTIONS: balance across all collections
- EACH_COLLECTION: balance each collection separately
- Regular expressions for pattern matching:
* Use exact match: volume.balance -collection="^mybucket$"
* Match multiple buckets: volume.balance -collection="bucket.*"
* Match all user collections: volume.balance -collection="user-.*"
Algorithm:
For each type of volume server (different max volume count limit){
for each collection {
balanceWritableVolumes()
balanceReadOnlyVolumes()
}
}
func balanceWritableVolumes(){
idealWritableVolumeRatio = totalWritableVolumes / totalNumberOfMaxVolumes
for hasMovedOneVolume {
sort all volume servers ordered by the localWritableVolumeRatio = localWritableVolumes to localVolumeMax
pick the volume server B with the highest localWritableVolumeRatio y
for any the volume server A with the number of writable volumes x + 1 <= idealWritableVolumeRatio * localVolumeMax {
if y > localWritableVolumeRatio {
if B has a writable volume id v that A does not have, and satisfy v replication requirements {
move writable volume v from A to B
}
}
}
}
}
func balanceReadOnlyVolumes(){
//similar to balanceWritableVolumes
}
`
}
// HasTag reports whether the command carries the given tag; it always
// answers false.
func (c *commandVolumeBalance) HasTag(_ CommandTag) bool {
	const tagged = false
	return tagged
}
// Do runs the volume.balance command.
//
// It parses args, records the -apply and -volumeBy choices on c, confirms the
// shell lock unless -noLock is given, collects the topology, and then balances
// the volume servers selected by -dataCenter, -racks and -nodes:
//   - -collection=EACH_COLLECTION balances every listed collection separately
//     using exact name matching;
//   - -collection=ALL_COLLECTIONS balances across all collections;
//   - any other value is compiled as a collection pattern.
//
// A flag parsing failure ends the command with a nil error. Any other failure
// is returned; an invalid collection pattern is wrapped so callers can unwrap
// the underlying error.
func (c *commandVolumeBalance) Do(args []string, commandEnv *CommandEnv, writer io.Writer) (err error) {
	// The command value is registered once in init and reused, and the
	// -volumeBy callback below only runs when the flag is present. Reset the
	// filter so a value from an earlier invocation does not leak into this one.
	c.volumeByActive = nil

	activeOnly, fullOnly := true, false
	allowedVolumeBy := map[string]*bool{
		"ALL":    nil,
		"ACTIVE": &activeOnly,
		"FULL":   &fullOnly,
	}

	balanceCommand := flag.NewFlagSet(c.Name(), flag.ContinueOnError)
	verbose := balanceCommand.Bool("v", false, "verbose mode")
	collection := balanceCommand.String("collection", "ALL_COLLECTIONS", "collection name, or use \"ALL_COLLECTIONS\" across collections, \"EACH_COLLECTION\" for each collection")
	dc := balanceCommand.String("dataCenter", "", "only apply the balancing for this dataCenter")
	racks := balanceCommand.String("racks", "", "only apply the balancing for this racks")
	nodes := balanceCommand.String("nodes", "", "only apply the balancing for this nodes")
	noLock := balanceCommand.Bool("noLock", false, "do not lock the admin shell at one's own risk")
	applyBalancing := balanceCommand.Bool("apply", false, "apply the balancing plan.")
	// TODO: remove this alias
	applyBalancingAlias := balanceCommand.Bool("force", false, "apply the balancing plan (alias for -apply)")
	balanceCommand.Func("volumeBy", "only apply the balancing for ALL volumes and ACTIVE or FULL", func(flagValue string) error {
		if flagValue == "" {
			return nil
		}
		volumeBy, ok := allowedVolumeBy[flagValue]
		if !ok {
			return fmt.Errorf("use \"ALL\", \"ACTIVE\" or \"FULL\"")
		}
		c.volumeByActive = volumeBy
		return nil
	})
	if err = balanceCommand.Parse(args); err != nil {
		// A parse failure ends the command without reporting an error.
		return nil
	}

	handleDeprecatedForceFlag(writer, balanceCommand, applyBalancingAlias, applyBalancing)
	c.applyBalancing = *applyBalancing
	infoAboutSimulationMode(writer, c.applyBalancing, "-apply")

	if *noLock {
		commandEnv.noLock = true
	} else if err = commandEnv.confirmIsLocked(args); err != nil {
		return err
	}
	commandEnv.verbose = *verbose
	c.commandEnv = commandEnv

	// collect topology information
	var topologyInfo *master_pb.TopologyInfo
	topologyInfo, c.volumeSizeLimitMb, err = collectTopologyInfo(commandEnv, 5*time.Second)
	if err != nil {
		return err
	}
	volumeServers := collectVolumeServersByDcRackNode(topologyInfo, *dc, *racks, *nodes)
	// Only the first result is needed here.
	volumeReplicas, _ := collectVolumeReplicaLocations(topologyInfo)
	diskTypes := collectVolumeDiskTypes(topologyInfo)

	switch *collection {
	case "EACH_COLLECTION":
		collections, listErr := ListCollectionNames(commandEnv, true, false)
		if listErr != nil {
			return listErr
		}
		for _, col := range collections {
			// A nil pattern selects exact string matching on the collection name.
			if err = c.balanceVolumeServers(diskTypes, volumeReplicas, volumeServers, nil, col); err != nil {
				return err
			}
		}
		return nil
	case "ALL_COLLECTIONS":
		// No pattern is needed when balancing across all collections.
		return c.balanceVolumeServers(diskTypes, volumeReplicas, volumeServers, nil, *collection)
	default:
		// Compile the user-provided pattern.
		collectionPattern, compileErr := compileCollectionPattern(*collection)
		if compileErr != nil {
			return fmt.Errorf("invalid collection pattern '%s': %w", *collection, compileErr)
		}
		return c.balanceVolumeServers(diskTypes, volumeReplicas, volumeServers, collectionPattern, *collection)
	}
}
// balanceVolumeServers balances the given nodes once per disk type, in the
// order the disk types are supplied, and stops at the first error.
func (c *commandVolumeBalance) balanceVolumeServers(diskTypes []types.DiskType, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, collectionPattern *regexp.Regexp, collectionName string) error {
	for i := range diskTypes {
		err := c.balanceVolumeServersByDiskType(diskTypes[i], volumeReplicas, nodes, collectionPattern, collectionName)
		if err != nil {
			return err
		}
	}
	return nil
}
// balanceVolumeServersByDiskType selects, on every node, the volumes of the
// given disk type that pass the collection and -volumeBy filters, then runs
// balanceSelectedVolume over the nodes.
//
// Collection filtering is skipped when collectionName is "ALL_COLLECTIONS";
// otherwise collectionPattern is used when non-nil and an exact comparison
// with collectionName is used when it is nil.
func (c *commandVolumeBalance) balanceVolumeServersByDiskType(diskType types.DiskType, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, collectionPattern *regexp.Regexp, collectionName string) error {
	wanted := func(v *master_pb.VolumeInformationMessage) bool {
		switch {
		case collectionName == "ALL_COLLECTIONS":
			// every collection qualifies
		case collectionPattern != nil:
			// regex pattern matching
			if !collectionPattern.MatchString(v.Collection) {
				return false
			}
		default:
			// exact string matching (for EACH_COLLECTION)
			if v.Collection != collectionName {
				return false
			}
		}
		if v.DiskType != string(diskType) {
			return false
		}
		return selectVolumesByActive(v.Size, c.volumeByActive, c.volumeSizeLimitMb)
	}

	for _, node := range nodes {
		node.selectVolumes(wanted)
	}
	return balanceSelectedVolume(c.commandEnv, diskType, volumeReplicas, nodes, sortWritableVolumes, c.volumeSizeLimitMb, c.applyBalancing)
}
// splitCSVSet turns a comma-separated list into a set used for exact-match
// filtering. Each item is trimmed of surrounding whitespace and blank items
// are dropped, so an empty result (len == 0) means no filter was given. The
// returned map is never nil.
func splitCSVSet(csv string) map[string]bool {
	isComma := func(r rune) bool { return r == ',' }

	result := map[string]bool{}
	for _, field := range strings.FieldsFunc(csv, isComma) {
		trimmed := strings.TrimSpace(field)
		if trimmed == "" {
			continue
		}
		result[trimmed] = true
	}
	return result
}
// collectVolumeServersByDcRackNode walks the topology and returns one Node per
// data node that passes the filters. An empty selectedDataCenter matches every
// data center; selectedRacks and selectedNodes are comma-separated id lists
// and match everything when they contain no items.
func collectVolumeServersByDcRackNode(t *master_pb.TopologyInfo, selectedDataCenter string, selectedRacks string, selectedNodes string) (nodes []*Node) {
	wantedRacks := splitCSVSet(selectedRacks)
	wantedNodes := splitCSVSet(selectedNodes)
	anyDataCenter := selectedDataCenter == ""
	anyRack := len(wantedRacks) == 0
	anyNode := len(wantedNodes) == 0

	for _, dataCenter := range t.DataCenterInfos {
		if !anyDataCenter && dataCenter.Id != selectedDataCenter {
			continue
		}
		for _, rack := range dataCenter.RackInfos {
			if !(anyRack || wantedRacks[rack.Id]) {
				continue
			}
			for _, dataNode := range rack.DataNodeInfos {
				if anyNode || wantedNodes[dataNode.Id] {
					nodes = append(nodes, &Node{
						info: dataNode,
						dc:   dataCenter.Id,
						rack: rack.Id,
					})
				}
			}
		}
	}
	return nodes
}
// collectVolumeDiskTypes returns the disk types found on any data node in the
// topology, one entry per distinct DiskInfos key, converted with
// types.ToDiskType.
//
// The keys are sorted before conversion so that the result, and therefore the
// order in which disk types are balanced and reported, is the same on every
// run instead of following Go's randomized map iteration order. The result is
// nil when the topology has no disks.
func collectVolumeDiskTypes(t *master_pb.TopologyInfo) (diskTypes []types.DiskType) {
	knownTypes := make(map[string]struct{})
	for _, dc := range t.DataCenterInfos {
		for _, r := range dc.RackInfos {
			for _, dn := range r.DataNodeInfos {
				for diskType := range dn.DiskInfos {
					knownTypes[diskType] = struct{}{}
				}
			}
		}
	}

	names := make([]string, 0, len(knownTypes))
	for name := range knownTypes {
		names = append(names, name)
	}
	slices.Sort(names)

	for _, name := range names {
		diskTypes = append(diskTypes, types.ToDiskType(name))
	}
	return diskTypes
}
// Node is a volume server taken from the topology together with the volumes
// currently selected on it for balancing.
type Node struct {
// info is the data node entry from the topology.
info *master_pb.DataNodeInfo
// selectedVolumes maps volume id to volume; it is rebuilt by selectVolumes
// and updated by adjustAfterMove.
selectedVolumes map[uint32]*master_pb.VolumeInformationMessage
// dc is the id of the data center the node was found in.
dc string
// rack is the id of the rack the node was found in.
rack string
}
// CapacityFunc computes a capacity figure for a data node; localVolumeRatio
// divides the number of selected volumes by it.
type CapacityFunc func(*master_pb.DataNodeInfo) float64
// DensityFunc computes two figures for a data node. localVolumeDensityRatio
// names them capacity and selectedVolumes; capacityByMinVolumeDensity returns
// the disk's max volume count minus its used volume count, and the used volume
// count.
type DensityFunc func(*master_pb.DataNodeInfo) (float64, uint64)
// capacityByMinVolumeDensity builds a DensityFunc for one disk type.
//
// The returned function sums the sizes of all volumes on that disk and divides
// by the volume size limit (integer division) to get a used volume count. It
// returns the disk's MaxVolumeCount minus that count, and the count itself.
// Both are zero when the node has no disk of the given type. A zero
// volumeSizeLimitMb falls back to util.VolumeSizeLimitGB * util.KiByte.
func capacityByMinVolumeDensity(diskType types.DiskType, volumeSizeLimitMb uint64) DensityFunc {
	if volumeSizeLimitMb == 0 {
		volumeSizeLimitMb = util.VolumeSizeLimitGB * util.KiByte
	}
	diskKey := string(diskType)

	return func(info *master_pb.DataNodeInfo) (float64, uint64) {
		disk, ok := info.DiskInfos[diskKey]
		if !ok {
			return 0, 0
		}
		var totalBytes uint64
		for i := range disk.VolumeInfos {
			totalBytes += disk.VolumeInfos[i].Size
		}
		used := totalBytes / (volumeSizeLimitMb * util.MiByte)
		remaining := disk.MaxVolumeCount - int64(used)
		return float64(remaining), used
	}
}
// capacityByMaxVolumeCount builds a CapacityFunc for one disk type that
// returns the disk's MaxVolumeCount minus its EC shard count divided by
// erasure_coding.DataShardsCount. It returns 0 when the node has no disk of
// that type.
func capacityByMaxVolumeCount(diskType types.DiskType) CapacityFunc {
	diskKey := string(diskType)

	return func(info *master_pb.DataNodeInfo) float64 {
		disk, ok := info.DiskInfos[diskKey]
		if !ok {
			return 0
		}
		shards := 0
		for _, ec := range disk.EcShardInfos {
			shards += erasure_coding.GetShardCount(ec)
		}
		ecVolumes := float64(shards) / erasure_coding.DataShardsCount
		return float64(disk.MaxVolumeCount) - ecVolumes
	}
}
// capacityByFreeVolumeCount builds a CapacityFunc for one disk type that
// returns MaxVolumeCount minus VolumeCount, less the disk's EC shard count
// divided by erasure_coding.DataShardsCount. It returns 0 when the node has no
// disk of that type.
func capacityByFreeVolumeCount(diskType types.DiskType) CapacityFunc {
	diskKey := string(diskType)

	return func(info *master_pb.DataNodeInfo) float64 {
		disk, ok := info.DiskInfos[diskKey]
		if !ok {
			return 0
		}
		shards := 0
		for _, ec := range disk.EcShardInfos {
			shards += erasure_coding.GetShardCount(ec)
		}
		ecVolumes := float64(shards) / erasure_coding.DataShardsCount
		return float64(disk.MaxVolumeCount-disk.VolumeCount) - ecVolumes
	}
}
// localVolumeDensityRatio returns the node's used volume count divided by its
// capacity, both taken from capacityFunc. A zero capacity yields 0, and a zero
// used count is replaced by countZeroSelectedVolumes.
func (n *Node) localVolumeDensityRatio(capacityFunc DensityFunc) float64 {
	capacity, used := capacityFunc(n.info)
	switch {
	case capacity == 0:
		return 0
	case used == 0:
		return countZeroSelectedVolumes / capacity
	default:
		return float64(used) / capacity
	}
}
// localVolumeDensityNextRatio returns the ratio the node would have with one
// more used volume: (used + 1) divided by capacity, or 0 when capacity is 0.
func (n *Node) localVolumeDensityNextRatio(capacityFunc DensityFunc) float64 {
	capacity, used := capacityFunc(n.info)
	if capacity != 0 {
		return float64(used+1) / capacity
	}
	return 0
}
// localVolumeRatio returns the number of selected volumes divided by the
// capacity capacityFunc reports for the node. The capacity is not checked for
// zero before dividing.
func (n *Node) localVolumeRatio(capacityFunc CapacityFunc) float64 {
	selected := float64(len(n.selectedVolumes))
	capacity := capacityFunc(n.info)
	return selected / capacity
}
// isOneVolumeOnly reports whether exactly one volume is selected on the node
// and at least one of its disks has both VolumeCount and MaxVolumeCount equal
// to 1.
func (n *Node) isOneVolumeOnly() bool {
	if len(n.selectedVolumes) == 1 {
		for _, disk := range n.info.DiskInfos {
			if disk.MaxVolumeCount == 1 && disk.VolumeCount == 1 {
				return true
			}
		}
	}
	return false
}
// selectVolumes replaces n.selectedVolumes with a fresh map holding every
// volume, across all of the node's disks, for which fn returns true, keyed by
// volume id.
func (n *Node) selectVolumes(fn func(v *master_pb.VolumeInformationMessage) bool) {
	selected := map[uint32]*master_pb.VolumeInformationMessage{}
	n.selectedVolumes = selected
	for _, disk := range n.info.DiskInfos {
		for _, volume := range disk.VolumeInfos {
			if !fn(volume) {
				continue
			}
			selected[volume.Id] = volume
		}
	}
}
// sortWritableVolumes sorts volumes in place by ascending Size.
func sortWritableVolumes(volumes []*master_pb.VolumeInformationMessage) {
	bySize := func(left, right *master_pb.VolumeInformationMessage) int {
		return cmp.Compare(left.Size, right.Size)
	}
	slices.SortFunc(volumes, bySize)
}
// selectVolumesByActive decides whether a volume of the given size passes the
// -volumeBy filter. A nil volumeByActive accepts every volume. Otherwise the
// size, scaled by thresholdVolumeSize, is compared with the size limit: a
// volume below the limit is accepted when *volumeByActive is true, and a
// volume at or above it is accepted when *volumeByActive is false.
func selectVolumesByActive(volumeSize uint64, volumeByActive *bool, volumeSizeLimitMb uint64) bool {
	if volumeByActive == nil {
		return true
	}
	scaledSize := uint64(float64(volumeSize) * thresholdVolumeSize)
	belowLimit := scaledSize < volumeSizeLimitMb*util.MiByte
	return belowLimit == *volumeByActive
}
// balanceSelectedVolume plans, and with applyBalancing performs, volume moves
// for one disk type until no further move is made.
//
// The ideal ratio is the sum of the used volume counts divided by the sum of
// the capacities reported by capacityByMinVolumeDensity over all nodes. Each
// round sorts the nodes with positive capacity by ascending density ratio,
// picks a source from the dense end, and tries to move one of its selected
// volumes (ordered by sortCandidatesFn) to a node earlier in the sorted order.
// A round ends after the first successful move so the nodes are re-sorted.
//
// It returns nil when there is nothing to balance, and the first move error
// whose message does not contain util.ErrVolumeNoSpaceLeft.
func balanceSelectedVolume(commandEnv *CommandEnv, diskType types.DiskType, volumeReplicas map[uint32][]*VolumeReplica, nodes []*Node, sortCandidatesFn func(volumes []*master_pb.VolumeInformationMessage), volumeSizeLimitMb uint64, applyBalancing bool) (err error) {
selectedVolumeCount, volumeCapacities := uint64(0), float64(0)
var nodesWithCapacity []*Node
if volumeSizeLimitMb == 0 {
volumeSizeLimitMb = util.VolumeSizeLimitGB * util.KiByte
}
capacityFunc := capacityByMinVolumeDensity(diskType, volumeSizeLimitMb)
// Totals are accumulated over every node, but only nodes with a positive
// capacity take part in the balancing rounds.
for _, dn := range nodes {
capacity, volumeCount := capacityFunc(dn.info)
if capacity > 0 {
nodesWithCapacity = append(nodesWithCapacity, dn)
}
volumeCapacities += capacity
selectedVolumeCount += volumeCount
}
if volumeCapacities == 0 {
return nil
}
idealVolumeRatio := float64(selectedVolumeCount) / volumeCapacities
hasMoved := true
if commandEnv != nil && commandEnv.verbose {
fmt.Fprintf(os.Stdout, "selected nodes %d, volumes:%d, cap:%d, idealVolumeRatio %f\n", len(nodesWithCapacity), selectedVolumeCount, int64(volumeCapacities), idealVolumeRatio*100)
}
for hasMoved {
hasMoved = false
// Ascending density: the densest nodes end up at the tail of the slice.
slices.SortFunc(nodesWithCapacity, func(a, b *Node) int {
return cmp.Compare(a.localVolumeDensityRatio(capacityFunc), b.localVolumeDensityRatio(capacityFunc))
})
if len(nodesWithCapacity) == 0 {
if commandEnv != nil && commandEnv.verbose {
fmt.Fprintf(os.Stdout, "no volume server found with capacity for %s", diskType.ReadableString())
}
return nil
}
// Scan from the densest node down for a source that has selected volumes
// and is not a one-volume-only node; fullNodeIndex is -1 if none qualifies.
var fullNode *Node
var fullNodeIndex int
for fullNodeIndex = len(nodesWithCapacity) - 1; fullNodeIndex >= 0; fullNodeIndex-- {
fullNode = nodesWithCapacity[fullNodeIndex]
if len(fullNode.selectedVolumes) == 0 {
continue
}
if !fullNode.isOneVolumeOnly() {
break
}
}
// NOTE(review): the candidate list is built before the fullNodeIndex == -1
// check below, so on that path it is collected from the last node visited
// and then discarded — confirm whether the check should come first.
var candidateVolumes []*master_pb.VolumeInformationMessage
for _, v := range fullNode.selectedVolumes {
candidateVolumes = append(candidateVolumes, v)
}
if fullNodeIndex == -1 {
if commandEnv != nil && commandEnv.verbose {
fmt.Fprintf(os.Stdout, "no nodes with capacity found for %s, nodes %d", diskType.ReadableString(), len(nodesWithCapacity))
}
return nil
}
sortCandidatesFn(candidateVolumes)
// Targets are the nodes sorted before the source, least dense first.
for _, emptyNode := range nodesWithCapacity[:fullNodeIndex] {
// Stop unless the source is above the ideal ratio and the target stays at
// or below it after receiving one more volume.
// NOTE(review): the source side is tested with the "next" ratio while the
// line printed below reports its current ratio — confirm which is intended.
if !(fullNode.localVolumeDensityNextRatio(capacityFunc) > idealVolumeRatio && emptyNode.localVolumeDensityNextRatio(capacityFunc) <= idealVolumeRatio) {
if commandEnv != nil && commandEnv.verbose {
fmt.Printf("no more volume servers with empty slots %s, idealVolumeRatio %f\n", emptyNode.info.Id, idealVolumeRatio)
}
break
}
fmt.Fprintf(os.Stdout, "%s %.2f %.2f:%.2f\t", diskType.ReadableString(), idealVolumeRatio,
fullNode.localVolumeDensityRatio(capacityFunc), emptyNode.localVolumeDensityNextRatio(capacityFunc))
if commandEnv != nil && commandEnv.verbose {
fmt.Fprintf(os.Stdout, "%s %.1f %.1f:%.1f\t", diskType.ReadableString(), idealVolumeRatio*100,
fullNode.localVolumeDensityRatio(capacityFunc)*100, emptyNode.localVolumeDensityNextRatio(capacityFunc)*100)
}
hasMoved, err = attemptToMoveOneVolume(commandEnv, volumeReplicas, fullNode, candidateVolumes, emptyNode, applyBalancing)
if err != nil {
if commandEnv != nil && commandEnv.verbose {
fmt.Fprintf(os.Stdout, "attempt to move one volume error %+v\n", err)
}
// A target that is out of space is skipped in favour of the next one; any
// other error aborts the balancing and is returned.
if strings.Contains(err.Error(), util.ErrVolumeNoSpaceLeft) {
continue
}
return
}
if hasMoved {
// moved one volume
break
}
}
}
return nil
}
// attemptToMoveOneVolume tries the candidate volumes in order and stops at the
// first one that is moved from fullNode to emptyNode or that fails with an
// error. It reports false with a nil error when no candidate could be moved.
func attemptToMoveOneVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, fullNode *Node, candidateVolumes []*master_pb.VolumeInformationMessage, emptyNode *Node, applyBalancing bool) (hasMoved bool, err error) {
	for _, candidate := range candidateVolumes {
		moved, moveErr := maybeMoveOneVolume(commandEnv, volumeReplicas, fullNode, candidate, emptyNode, applyBalancing)
		if moveErr != nil || moved {
			return moved, moveErr
		}
	}
	return false, nil
}
// maybeMoveOneVolume moves candidateVolume from fullNode to emptyNode when the
// move is allowed, and updates the in-memory bookkeeping afterwards.
//
// It fails when the shell lock is no longer held or the volume has a remote
// storage name. It declines (false, nil) when the volume has a replica
// placement that isGoodMove rejects, or when emptyNode already has the volume
// selected. An error from moveVolume is returned as is.
func maybeMoveOneVolume(commandEnv *CommandEnv, volumeReplicas map[uint32][]*VolumeReplica, fullNode *Node, candidateVolume *master_pb.VolumeInformationMessage, emptyNode *Node, applyChange bool) (hasMoved bool, err error) {
	if !commandEnv.isLocked() {
		return false, fmt.Errorf("lock is lost")
	}
	if candidateVolume.RemoteStorageName != "" {
		return false, fmt.Errorf("does not move volume in remote storage")
	}

	if candidateVolume.ReplicaPlacement > 0 {
		// The conversion error is not inspected.
		placement, _ := super_block.NewReplicaPlacementFromByte(byte(candidateVolume.ReplicaPlacement))
		replicas := volumeReplicas[candidateVolume.Id]
		if !isGoodMove(placement, replicas, fullNode, emptyNode) {
			return false, nil
		}
	}

	if _, alreadyThere := emptyNode.selectedVolumes[candidateVolume.Id]; alreadyThere {
		return false, nil
	}
	if moveErr := moveVolume(commandEnv, candidateVolume, fullNode, emptyNode, applyChange); moveErr != nil {
		return false, moveErr
	}
	adjustAfterMove(candidateVolume, volumeReplicas, fullNode, emptyNode)
	return true, nil
}
// moveVolume prints the planned move of v from fullNode to emptyNode and, when
// applyChange is true, performs it with LiveMoveVolume. With applyChange false
// it only prints the plan and returns nil.
func moveVolume(commandEnv *CommandEnv, v *master_pb.VolumeInformationMessage, fullNode *Node, emptyNode *Node, applyChange bool) error {
	// Volumes in the unnamed collection are printed without a prefix.
	collectionPrefix := ""
	if v.Collection != "" {
		collectionPrefix = v.Collection + "_"
	}
	fmt.Fprintf(os.Stdout, "  moving %s volume %s%d %s => %s\n", v.DiskType, collectionPrefix, v.Id, fullNode.info.Id, emptyNode.info.Id)

	if !applyChange {
		return nil
	}
	source := pb.NewServerAddressFromDataNode(fullNode.info)
	target := pb.NewServerAddressFromDataNode(emptyNode.info)
	return LiveMoveVolume(commandEnv.option.GrpcDialOption, os.Stderr, needle.VolumeId(v.Id), source, target, 5*time.Second, v.DiskType, 0, v.ReadOnly)
}
// isGoodMove reports whether moving a replica from sourceNode to targetNode is
// acceptable. It rejects the move when targetNode (same node id, rack and data
// center) already holds a replica; otherwise it asks satisfyReplicaPlacement
// whether the replicas not on sourceNode, together with the target location,
// satisfy the placement.
func isGoodMove(placement *super_block.ReplicaPlacement, existingReplicas []*VolumeReplica, sourceNode, targetNode *Node) bool {
	// Replicas that stay where they are, i.e. all except the one on sourceNode.
	remaining := make([]*VolumeReplica, 0)
	for _, replica := range existingReplicas {
		where := replica.location
		onTarget := where.dataNode.Id == targetNode.info.Id &&
			where.rack == targetNode.rack &&
			where.dc == targetNode.dc
		if onTarget {
			// never move to existing nodes
			return false
		}
		if where.dataNode.Id != sourceNode.info.Id {
			remaining = append(remaining, replica)
		}
	}

	target := location{
		dc:       targetNode.dc,
		rack:     targetNode.rack,
		dataNode: targetNode.info,
	}
	return satisfyReplicaPlacement(placement, remaining, target)
}
// removeVolumeInfo removes the first entry with the given volume id from
// diskInfo.VolumeInfos. The last entry takes the removed entry's place, so the
// order of the remaining entries is not preserved. It does nothing when the id
// is not present.
func removeVolumeInfo(diskInfo *master_pb.DiskInfo, volumeId uint32) {
	infos := diskInfo.VolumeInfos
	at := slices.IndexFunc(infos, func(info *master_pb.VolumeInformationMessage) bool {
		return info.Id == volumeId
	})
	if at < 0 {
		return
	}
	tail := len(infos) - 1
	infos[at] = infos[tail]
	// Clear the vacated slot so the backing array drops its reference.
	infos[tail] = nil
	diskInfo.VolumeInfos = infos[:tail]
}
// adjustAfterMove updates the in-memory view after volume v has been moved (or
// planned to move) from fullNode to emptyNode.
//
// The volume is removed from fullNode's selection and added to emptyNode's
// when that map exists. The first replica of v recorded on fullNode (same node
// id, rack and data center) is then pointed at emptyNode, and the volume's
// entry and count are transferred between the two nodes' disks of v.DiskType.
// The disk bookkeeping only happens when such a replica is found.
func adjustAfterMove(v *master_pb.VolumeInformationMessage, volumeReplicas map[uint32][]*VolumeReplica, fullNode *Node, emptyNode *Node) {
	delete(fullNode.selectedVolumes, v.Id)
	if emptyNode.selectedVolumes != nil {
		emptyNode.selectedVolumes[v.Id] = v
	}

	for _, replica := range volumeReplicas[v.Id] {
		onSource := replica.location.dataNode.Id == fullNode.info.Id &&
			replica.location.rack == fullNode.rack &&
			replica.location.dc == fullNode.dc
		if !onSource {
			continue
		}

		moved := newLocation(emptyNode.dc, emptyNode.rack, emptyNode.info)
		replica.location = &moved

		// Carry the volume's size accounting over to the target disk so that
		// capacityByMinVolumeDensity sees fresh ratios on the next round;
		// with a stale view the planner would keep draining the same node.
		if sourceDisk, ok := fullNode.info.DiskInfos[v.DiskType]; ok {
			removeVolumeInfo(sourceDisk, v.Id)
			addVolumeCount(sourceDisk, -1)
		}
		if targetDisk, ok := emptyNode.info.DiskInfos[v.DiskType]; ok {
			targetDisk.VolumeInfos = append(targetDisk.VolumeInfos, v)
			addVolumeCount(targetDisk, 1)
		}
		return
	}
}