mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-06-02 06:56:24 +00:00
* fix(storage): keep EC .vif when deleting a coexisting regular volume A regular volume and an EC volume for the same id share <base>.vif. When EC shards are distributed onto a server that still holds the regular volume — the encode source, or any replica the planner targets — the post-encode VolumeDelete ran removeVolumeFiles and stripped the shared .vif, leaving the freshly built EC volume without its info file. Skip the .vif in removeVolumeFiles when an EC volume for the same id exists on the disk (mounted, or a sealed .ecx on disk). The regular volume's .dat/.idx still go; the EC sidecars survive. A two-server end-to-end test encodes a volume whose source and a stub replica both also receive shards, and asserts the final on-disk layout: both .dat/.idx gone, each server holding only its assigned shards plus .ecx/.vif. Storage unit tests cover the with-EC and no-EC cases, and the Rust seaweed-volume port carries the same guard and tests. * test(storage): assert .idx is removed in the no-EC destroy case Strengthen TestDestroyRemovesVifWhenNoEc to confirm the full regular volume cleanup (.dat, .idx, .vif) when no EC volume coexists.
406 lines
13 KiB
Go
406 lines
13 KiB
Go
package storage
|
|
|
|
import (
|
|
"bytes"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"syscall"
|
|
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/backend"
|
|
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
|
|
. "github.com/seaweedfs/seaweedfs/weed/storage/types"
|
|
)
|
|
|
|
// Sentinel errors declared by the storage package.
var (
	// ErrorNotFound is the "not found" sentinel.
	ErrorNotFound = errors.New("not found")
	// ErrorDeleted is the "already deleted" sentinel.
	ErrorDeleted = errors.New("already deleted")
	// ErrorSizeMismatch is the "size mismatch" sentinel.
	ErrorSizeMismatch = errors.New("size mismatch")
)

// IoErrorTolerance is how many EIOs in a row a volume has to observe before
// CollectHeartbeat considers the replica broken. One transient failure is
// tolerated, so that a short NFS / fabric / power hiccup hitting several
// replicas simultaneously cannot snowball into dropping the last healthy
// copy.
const IoErrorTolerance = 3
|
|
|
|
func (v *Volume) checkReadWriteError(err error) {
|
|
if err == nil {
|
|
v.clearIoError()
|
|
return
|
|
}
|
|
if errors.Is(err, syscall.EIO) {
|
|
v.noteIoError(err)
|
|
return
|
|
}
|
|
// non-EIO error breaks the EIO streak — only sustained EIOs should
|
|
// be treated as a failing volume.
|
|
v.clearIoError()
|
|
}
|
|
|
|
// isFileUnchanged checks whether this needle to write is same as last one.
|
|
// It requires serialized access in the same volume.
|
|
func (v *Volume) isFileUnchanged(n *needle.Needle) bool {
|
|
if v.Ttl.String() != "" {
|
|
return false
|
|
}
|
|
|
|
nv, ok := v.nm.Get(n.Id)
|
|
if ok && !nv.Offset.IsZero() && nv.Size.IsValid() {
|
|
oldNeedle := new(needle.Needle)
|
|
err := oldNeedle.ReadData(v.DataBackend, nv.Offset.ToActualOffset(), nv.Size, v.Version())
|
|
if err != nil {
|
|
glog.V(0).Infof("Failed to check updated file at offset %d size %d: %v", nv.Offset.ToActualOffset(), nv.Size, err)
|
|
return false
|
|
}
|
|
if oldNeedle.Cookie == n.Cookie && oldNeedle.Checksum == n.Checksum && bytes.Equal(oldNeedle.Data, n.Data) {
|
|
n.DataSize = oldNeedle.DataSize
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// ErrVolumeNotEmpty is returned by Destroy when onlyEmpty is requested but
// the volume still holds data.
var ErrVolumeNotEmpty = errors.New("volume not empty")
|
|
|
|
// Destroy removes everything related to this volume. When keepRemoteData is
|
|
// true the cloud-tier object backing the volume is left intact — used by
|
|
// moves where another server is taking over the same .vif.
|
|
func (v *Volume) Destroy(onlyEmpty bool, keepRemoteData bool) (err error) {
|
|
v.dataFileAccessLock.Lock()
|
|
defer v.dataFileAccessLock.Unlock()
|
|
|
|
if onlyEmpty {
|
|
isEmpty, e := v.doIsEmpty()
|
|
if e != nil {
|
|
err = fmt.Errorf("failed to read isEmpty %v", e)
|
|
return
|
|
}
|
|
if !isEmpty {
|
|
err = ErrVolumeNotEmpty
|
|
return
|
|
}
|
|
}
|
|
if !v.isCompactionInProgress.CompareAndSwap(false, true) {
|
|
err = fmt.Errorf("volume %d is compacting", v.Id)
|
|
return
|
|
}
|
|
close(v.asyncRequestsChan)
|
|
if !keepRemoteData {
|
|
storageName, storageKey := v.RemoteStorageNameKey()
|
|
if v.HasRemoteFile() && storageName != "" && storageKey != "" {
|
|
if backendStorage, found := backend.BackendStorages[storageName]; found {
|
|
backendStorage.DeleteFile(storageKey)
|
|
}
|
|
}
|
|
}
|
|
// A regular volume and an EC volume for the same id share <base>.vif. When
|
|
// EC artefacts coexist on this disk (e.g. shards distributed onto a source
|
|
// replica before it is deleted), keep the .vif so removing the regular
|
|
// volume does not strip the EC volume's info file.
|
|
keepVif := v.sharesVifWithEcVolume()
|
|
v.doClose()
|
|
removeVolumeFiles(v.DataFileName(), keepVif)
|
|
removeVolumeFiles(v.IndexFileName(), keepVif)
|
|
return
|
|
}
|
|
|
|
// sharesVifWithEcVolume reports whether an EC volume for this volume id lives
|
|
// on the same disk, in which case its .vif is the same file as the regular
|
|
// volume's and must outlive the regular volume's deletion.
|
|
func (v *Volume) sharesVifWithEcVolume() bool {
|
|
if v.location == nil {
|
|
return false
|
|
}
|
|
if _, found := v.location.FindEcVolume(v.Id); found {
|
|
return true
|
|
}
|
|
return v.location.HasEcxFileOnDisk(v.Collection, v.Id)
|
|
}
|
|
|
|
func removeVolumeFiles(filename string, keepVif bool) {
|
|
// .dat/.idx removals log at V(0) so destructive calls are traceable.
|
|
deleteAndLog := func(ext string) {
|
|
fullFilename := filename + "." + ext
|
|
st, statErr := os.Stat(fullFilename)
|
|
err := os.RemoveAll(fullFilename)
|
|
if err != nil {
|
|
glog.V(0).Infof("failed to remove volume file %s: %s", fullFilename, err)
|
|
return
|
|
}
|
|
if statErr == nil && (ext == "dat" || ext == "idx") {
|
|
glog.Infof("removed volume file %s (size=%d)", fullFilename, st.Size())
|
|
}
|
|
}
|
|
deleteAndLog("dat")
|
|
deleteAndLog("idx")
|
|
if !keepVif {
|
|
deleteAndLog("vif")
|
|
}
|
|
// sorted index file
|
|
deleteAndLog("sdx")
|
|
// compaction
|
|
deleteAndLog("cpd")
|
|
deleteAndLog("cpx")
|
|
// level db index file
|
|
deleteAndLog("ldb")
|
|
// redb index file (Rust volume server)
|
|
deleteAndLog("rdb")
|
|
// marker for damaged or incomplete volume
|
|
deleteAndLog("note")
|
|
}
|
|
|
|
func (v *Volume) asyncRequestAppend(request *needle.AsyncRequest) {
|
|
v.asyncRequestsChan <- request
|
|
}
|
|
|
|
func (v *Volume) syncWrite(n *needle.Needle, checkCookie bool) (offset uint64, size Size, isUnchanged bool, err error) {
|
|
// glog.V(4).Infof("writing needle %s", needle.NewFileIdFromNeedle(v.Id, n).String())
|
|
v.dataFileAccessLock.Lock()
|
|
defer v.dataFileAccessLock.Unlock()
|
|
|
|
return v.doWriteRequest(n, checkCookie)
|
|
}
|
|
|
|
func (v *Volume) writeNeedle2(n *needle.Needle, checkCookie bool, fsync bool) (offset uint64, size Size, isUnchanged bool, err error) {
|
|
// glog.V(4).Infof("writing needle %s", needle.NewFileIdFromNeedle(v.Id, n).String())
|
|
if n.Ttl == needle.EMPTY_TTL && v.Ttl != needle.EMPTY_TTL {
|
|
n.SetHasTtl()
|
|
n.Ttl = v.Ttl
|
|
}
|
|
|
|
if !fsync {
|
|
return v.syncWrite(n, checkCookie)
|
|
} else {
|
|
asyncRequest := needle.NewAsyncRequest(n, true)
|
|
// using len(n.Data) here instead of n.Size before n.Size is populated in n.Append()
|
|
asyncRequest.ActualSize = needle.GetActualSize(Size(len(n.Data)), v.Version())
|
|
|
|
v.asyncRequestAppend(asyncRequest)
|
|
offset, _, isUnchanged, err = asyncRequest.WaitComplete()
|
|
|
|
return
|
|
}
|
|
}
|
|
|
|
func (v *Volume) doWriteRequest(n *needle.Needle, checkCookie bool) (offset uint64, size Size, isUnchanged bool, err error) {
|
|
// glog.V(4).Infof("writing needle %s", needle.NewFileIdFromNeedle(v.Id, n).String())
|
|
if v.isFileUnchanged(n) {
|
|
size = Size(n.DataSize)
|
|
isUnchanged = true
|
|
return
|
|
}
|
|
|
|
// check whether existing needle cookie matches
|
|
nv, ok := v.nm.Get(n.Id)
|
|
if ok {
|
|
existingNeedle, _, _, existingNeedleReadErr := needle.ReadNeedleHeader(v.DataBackend, v.Version(), nv.Offset.ToActualOffset())
|
|
if existingNeedleReadErr != nil {
|
|
err = fmt.Errorf("reading existing needle: %w", existingNeedleReadErr)
|
|
return
|
|
}
|
|
if n.Cookie == 0 && !checkCookie {
|
|
// this is from batch deletion, and read back again when tailing a remote volume
|
|
// which only happens when checkCookie == false and fsync == false
|
|
n.Cookie = existingNeedle.Cookie
|
|
}
|
|
if existingNeedle.Cookie != n.Cookie {
|
|
glog.V(0).Infof("write cookie mismatch: existing %s, new %s",
|
|
needle.NewFileIdFromNeedle(v.Id, existingNeedle), needle.NewFileIdFromNeedle(v.Id, n))
|
|
err = fmt.Errorf("mismatching cookie %x", n.Cookie)
|
|
return
|
|
}
|
|
}
|
|
|
|
// append to dat file
|
|
n.UpdateAppendAtNs(v.lastAppendAtNs)
|
|
var actualSize int64
|
|
offset, size, actualSize, err = n.Append(v.DataBackend, v.Version())
|
|
v.checkReadWriteError(err)
|
|
if err != nil {
|
|
err = fmt.Errorf("append to volume %d size %d actualSize %d: %v", v.Id, size, actualSize, err)
|
|
return
|
|
}
|
|
v.lastAppendAtNs = n.AppendAtNs
|
|
|
|
// add to needle map
|
|
if !ok || uint64(nv.Offset.ToActualOffset()) < offset {
|
|
if err = v.nm.Put(n.Id, ToOffset(int64(offset)), n.Size); err != nil {
|
|
glog.V(4).Infof("failed to save in needle map %d: %v", n.Id, err)
|
|
}
|
|
}
|
|
if v.lastModifiedTsSeconds < n.LastModified {
|
|
v.lastModifiedTsSeconds = n.LastModified
|
|
}
|
|
return
|
|
}
|
|
|
|
func (v *Volume) syncDelete(n *needle.Needle) (Size, error) {
|
|
// glog.V(4).Infof("delete needle %s", needle.NewFileIdFromNeedle(v.Id, n).String())
|
|
v.dataFileAccessLock.Lock()
|
|
defer v.dataFileAccessLock.Unlock()
|
|
|
|
if v.nm == nil {
|
|
return 0, nil
|
|
}
|
|
|
|
return v.doDeleteRequest(n)
|
|
}
|
|
|
|
func (v *Volume) deleteNeedle2(n *needle.Needle) (Size, error) {
|
|
// todo: delete info is always appended no fsync, it may need fsync in future
|
|
fsync := false
|
|
|
|
if !fsync {
|
|
return v.syncDelete(n)
|
|
} else {
|
|
asyncRequest := needle.NewAsyncRequest(n, false)
|
|
asyncRequest.ActualSize = needle.GetActualSize(0, v.Version())
|
|
|
|
v.asyncRequestAppend(asyncRequest)
|
|
_, size, _, err := asyncRequest.WaitComplete()
|
|
|
|
return Size(size), err
|
|
}
|
|
}
|
|
|
|
func (v *Volume) doDeleteRequest(n *needle.Needle) (Size, error) {
|
|
glog.V(4).Infof("delete needle %s", needle.NewFileIdFromNeedle(v.Id, n).String())
|
|
nv, ok := v.nm.Get(n.Id)
|
|
// fmt.Println("key", n.Id, "volume offset", nv.Offset, "data_size", n.Size, "cached size", nv.Size)
|
|
if ok && !nv.Size.IsDeleted() {
|
|
var offset uint64
|
|
var err error
|
|
size := nv.Size
|
|
if !v.hasRemoteFile {
|
|
n.Data = nil
|
|
n.UpdateAppendAtNs(v.lastAppendAtNs)
|
|
offset, _, _, err = n.Append(v.DataBackend, v.Version())
|
|
v.checkReadWriteError(err)
|
|
if err != nil {
|
|
return size, err
|
|
}
|
|
}
|
|
v.lastAppendAtNs = n.AppendAtNs
|
|
if err = v.nm.Delete(n.Id, ToOffset(int64(offset))); err != nil {
|
|
return size, err
|
|
}
|
|
return size, err
|
|
}
|
|
return 0, nil
|
|
}
|
|
|
|
func (v *Volume) startWorker() {
|
|
go func() {
|
|
chanClosed := false
|
|
for {
|
|
// chan closed. go thread will exit
|
|
if chanClosed {
|
|
break
|
|
}
|
|
currentRequests := make([]*needle.AsyncRequest, 0, 128)
|
|
currentBytesToWrite := int64(0)
|
|
for {
|
|
request, ok := <-v.asyncRequestsChan
|
|
// volume may be closed
|
|
if !ok {
|
|
chanClosed = true
|
|
break
|
|
}
|
|
if MaxPossibleVolumeSize < v.ContentSize()+uint64(currentBytesToWrite+request.ActualSize) {
|
|
request.Complete(0, 0, false,
|
|
fmt.Errorf("volume size limit %d exceeded! current size is %d", MaxPossibleVolumeSize, v.ContentSize()))
|
|
break
|
|
}
|
|
currentRequests = append(currentRequests, request)
|
|
currentBytesToWrite += request.ActualSize
|
|
// submit at most 4M bytes or 128 requests at one time to decrease request delay.
|
|
// it also need to break if there is no data in channel to avoid io hang.
|
|
if currentBytesToWrite >= 4*1024*1024 || len(currentRequests) >= 128 || len(v.asyncRequestsChan) == 0 {
|
|
break
|
|
}
|
|
}
|
|
if len(currentRequests) == 0 {
|
|
continue
|
|
}
|
|
v.dataFileAccessLock.Lock()
|
|
end, _, e := v.DataBackend.GetStat()
|
|
if e != nil {
|
|
for i := 0; i < len(currentRequests); i++ {
|
|
currentRequests[i].Complete(0, 0, false,
|
|
fmt.Errorf("cannot read current volume position: %v", e))
|
|
}
|
|
v.dataFileAccessLock.Unlock()
|
|
continue
|
|
}
|
|
|
|
for i := 0; i < len(currentRequests); i++ {
|
|
if currentRequests[i].IsWriteRequest {
|
|
offset, size, isUnchanged, err := v.doWriteRequest(currentRequests[i].N, true)
|
|
currentRequests[i].UpdateResult(offset, uint64(size), isUnchanged, err)
|
|
} else {
|
|
size, err := v.doDeleteRequest(currentRequests[i].N)
|
|
currentRequests[i].UpdateResult(0, uint64(size), false, err)
|
|
}
|
|
}
|
|
|
|
// if sync error, data is not reliable, we should mark the completed request as fail and rollback
|
|
if err := v.DataBackend.Sync(); err != nil {
|
|
// todo: this may generate dirty data or cause data inconsistent, may be weed need to panic?
|
|
if te := v.DataBackend.Truncate(end); te != nil {
|
|
glog.V(0).Infof("Failed to truncate %s back to %d with error: %v", v.DataBackend.Name(), end, te)
|
|
}
|
|
for i := 0; i < len(currentRequests); i++ {
|
|
if currentRequests[i].IsSucceed() {
|
|
currentRequests[i].UpdateResult(0, 0, false, err)
|
|
}
|
|
}
|
|
}
|
|
|
|
for i := 0; i < len(currentRequests); i++ {
|
|
currentRequests[i].Submit()
|
|
}
|
|
v.dataFileAccessLock.Unlock()
|
|
}
|
|
}()
|
|
}
|
|
|
|
func (v *Volume) WriteNeedleBlob(needleId NeedleId, needleBlob []byte, size Size) error {
|
|
|
|
v.dataFileAccessLock.Lock()
|
|
defer v.dataFileAccessLock.Unlock()
|
|
|
|
if MaxPossibleVolumeSize < v.nm.ContentSize()+uint64(len(needleBlob)) {
|
|
return fmt.Errorf("volume size limit %d exceeded! current size is %d", MaxPossibleVolumeSize, v.nm.ContentSize())
|
|
}
|
|
|
|
nv, ok := v.nm.Get(needleId)
|
|
if ok && nv.Size == size {
|
|
oldNeedle := new(needle.Needle)
|
|
err := oldNeedle.ReadData(v.DataBackend, nv.Offset.ToActualOffset(), nv.Size, v.Version())
|
|
if err == nil {
|
|
newNeedle := new(needle.Needle)
|
|
err = newNeedle.ReadBytes(needleBlob, nv.Offset.ToActualOffset(), size, v.Version())
|
|
if err == nil && oldNeedle.Cookie == newNeedle.Cookie && oldNeedle.Checksum == newNeedle.Checksum && bytes.Equal(oldNeedle.Data, newNeedle.Data) {
|
|
glog.V(0).Infof("needle %v already exists", needleId)
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
appendAtNs := needle.GetAppendAtNs(v.lastAppendAtNs)
|
|
offset, err := needle.WriteNeedleBlob(v.DataBackend, needleBlob, size, appendAtNs, v.Version())
|
|
|
|
v.checkReadWriteError(err)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
v.lastAppendAtNs = appendAtNs
|
|
|
|
// add to needle map
|
|
if err = v.nm.Put(needleId, ToOffset(int64(offset)), size); err != nil {
|
|
glog.V(4).Infof("failed to put in needle map %d: %v", needleId, err)
|
|
}
|
|
|
|
return err
|
|
}
|