mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-09 05:13:18 +00:00
Compare commits
37 Commits
greg/block
...
cw/go_rest
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
305361c5ea | ||
|
|
eb244065dc | ||
|
|
2de875e4d8 | ||
|
|
f47dcba80c | ||
|
|
1f345e350d | ||
|
|
ae4b55a147 | ||
|
|
2be15d416d | ||
|
|
c4147a7e8d | ||
|
|
d653c78504 | ||
|
|
0d910eb7ab | ||
|
|
41b1d1180b | ||
|
|
130e10626d | ||
|
|
281cd4f87a | ||
|
|
c78b5cdecc | ||
|
|
6da7034d48 | ||
|
|
60e14e20dc | ||
|
|
5316905d12 | ||
|
|
ea41b198a4 | ||
|
|
027a6ebce6 | ||
|
|
1ac0e5bfd3 | ||
|
|
f6a40de3b0 | ||
|
|
17451841bf | ||
|
|
51f50529fc | ||
|
|
7707d98b54 | ||
|
|
8c195ee4ab | ||
|
|
7b5f59ca53 | ||
|
|
597ce6a4c0 | ||
|
|
afeeb47918 | ||
|
|
660f46a3b4 | ||
|
|
4697424c7c | ||
|
|
11f624926b | ||
|
|
173e0f1edd | ||
|
|
ca57794a00 | ||
|
|
f5f39f4432 | ||
|
|
022e280f0b | ||
|
|
897f26c839 | ||
|
|
25d5b507a1 |
244
restore/cmd/test_restore/test_restore.go
Normal file
244
restore/cmd/test_restore/test_restore.go
Normal file
@@ -0,0 +1,244 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"syscall"
|
||||
|
||||
"restore/pkg/restore"
|
||||
)
|
||||
|
||||
type options struct {
|
||||
metaPath string
|
||||
sourceDir string
|
||||
numWorkers int
|
||||
}
|
||||
|
||||
// hardlinkTracker keeps track of inodes we've already processed
|
||||
type hardlinkTracker struct {
|
||||
sync.Mutex
|
||||
seen map[uint64]bool
|
||||
}
|
||||
|
||||
func newHardlinkTracker() *hardlinkTracker {
|
||||
return &hardlinkTracker{
|
||||
seen: make(map[uint64]bool),
|
||||
}
|
||||
}
|
||||
|
||||
func (h *hardlinkTracker) isNewInode(ino uint64, nlink bool) bool {
|
||||
if !nlink {
|
||||
return true
|
||||
}
|
||||
|
||||
h.Lock()
|
||||
defer h.Unlock()
|
||||
|
||||
if _, exists := h.seen[ino]; exists {
|
||||
return false
|
||||
}
|
||||
|
||||
h.seen[ino] = true
|
||||
return true
|
||||
}
|
||||
|
||||
// getFileInfo extracts file information from os.FileInfo
|
||||
func getFileInfo(info os.FileInfo) restore.FileInfo {
|
||||
stat := info.Sys().(*syscall.Stat_t)
|
||||
|
||||
// Use target inode number if specified, otherwise use actual inode number
|
||||
ino := uint64(stat.Ino)
|
||||
|
||||
return restore.FileInfo{
|
||||
Ino: ino,
|
||||
Mode: uint32(stat.Mode),
|
||||
Uid: uint32(stat.Uid),
|
||||
Gid: uint32(stat.Gid),
|
||||
Size: uint64(stat.Size),
|
||||
Rdev: uint64(stat.Rdev),
|
||||
AtimeSec: stat.Atim.Sec,
|
||||
AtimeNsec: stat.Atim.Nsec,
|
||||
MtimeSec: stat.Mtim.Sec,
|
||||
MtimeNsec: stat.Mtim.Nsec,
|
||||
CtimeSec: stat.Ctim.Sec,
|
||||
CtimeNsec: stat.Ctim.Nsec,
|
||||
IsDir: info.IsDir(),
|
||||
IsRegular: stat.Mode&syscall.S_IFMT == syscall.S_IFREG,
|
||||
}
|
||||
}
|
||||
|
||||
// getXAttrs gets extended attributes for a file/directory
|
||||
func getXAttrs(path string) ([]restore.XAttr, error) {
|
||||
size, err := syscall.Listxattr(path, nil)
|
||||
if err != nil || size == 0 {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
buf := make([]byte, size)
|
||||
size, err = syscall.Listxattr(path, buf)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var xattrs []restore.XAttr
|
||||
start := 0
|
||||
for i := 0; i < size; i++ {
|
||||
if buf[i] == 0 {
|
||||
name := string(buf[start:i])
|
||||
value, err := syscall.Getxattr(path, name, nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
valueBuf := make([]byte, value)
|
||||
_, err = syscall.Getxattr(path, name, valueBuf)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
xattrs = append(xattrs, restore.XAttr{
|
||||
Name: name,
|
||||
Value: valueBuf,
|
||||
})
|
||||
start = i + 1
|
||||
}
|
||||
}
|
||||
|
||||
return xattrs, nil
|
||||
}
|
||||
|
||||
func restorePath(writer *restore.WorkerWriter, hlTracker *hardlinkTracker, path string, parentIno uint64) error {
|
||||
entries, err := os.ReadDir(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read directory: %v", err)
|
||||
}
|
||||
log.Printf("Restoring path: %s", path)
|
||||
var subdirs int
|
||||
var nameBytes int
|
||||
|
||||
for pos, entry := range entries {
|
||||
if entry.Name() == "." || entry.Name() == ".." {
|
||||
continue
|
||||
}
|
||||
|
||||
info, err := entry.Info()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get entry info: %v", err)
|
||||
}
|
||||
|
||||
stat, ok := info.Sys().(*syscall.Stat_t)
|
||||
if !ok {
|
||||
return fmt.Errorf("failed to get stat_t")
|
||||
}
|
||||
nameBytes += len(entry.Name())
|
||||
fullPath := filepath.Join(path, entry.Name())
|
||||
|
||||
// Recurse into directories
|
||||
if info.IsDir() {
|
||||
subdirs++
|
||||
|
||||
if err := restorePath(writer, hlTracker, fullPath, uint64(stat.Ino)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
err = writer.CreateEntry(parentIno, uint64(pos), uint64(stat.Ino), uint32(info.Mode()), entry.Name())
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create entry: %v", err)
|
||||
}
|
||||
|
||||
// Handle inode
|
||||
isHardlink := stat.Nlink > 1
|
||||
if !info.IsDir() && hlTracker.isNewInode(uint64(stat.Ino), isHardlink) {
|
||||
fileInfo := getFileInfo(info)
|
||||
err = writer.CreateInode(fileInfo)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create inode: %v", err)
|
||||
}
|
||||
|
||||
// Handle xattrs
|
||||
xattrs, err := getXAttrs(fullPath)
|
||||
if err == nil {
|
||||
for pos, xattr := range xattrs {
|
||||
err = writer.CreateXAttr(uint64(stat.Ino), uint64(pos), xattr)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create xattr: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Get directory info
|
||||
dirInfo, err := os.Stat(path)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to stat directory: %v", err)
|
||||
}
|
||||
|
||||
// Create directory inode
|
||||
dirFileInfo := getFileInfo(dirInfo)
|
||||
dirFileInfo.NrSubdirs = uint64(subdirs)
|
||||
dirFileInfo.NameBytes = uint64(nameBytes)
|
||||
|
||||
return writer.CreateInode(dirFileInfo)
|
||||
}
|
||||
|
||||
func main() {
|
||||
opts := options{}
|
||||
flag.StringVar(&opts.metaPath, "m", "", "path to metadata device")
|
||||
flag.StringVar(&opts.sourceDir, "s", "", "path to source directory")
|
||||
flag.IntVar(&opts.numWorkers, "w", 4, "number of worker threads")
|
||||
flag.Parse()
|
||||
|
||||
if opts.metaPath == "" || opts.sourceDir == "" {
|
||||
flag.Usage()
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
// Create master and worker writers
|
||||
master, workers, err := restore.NewWriters(opts.metaPath, opts.numWorkers)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to create writers: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
defer master.Destroy()
|
||||
|
||||
// Create hardlink tracker
|
||||
hlTracker := newHardlinkTracker()
|
||||
|
||||
// Start workers
|
||||
var wg sync.WaitGroup
|
||||
for i, worker := range workers {
|
||||
wg.Add(1)
|
||||
go func(w *restore.WorkerWriter, workerNum int) {
|
||||
defer wg.Done()
|
||||
|
||||
// Each worker processes a subset of the directory tree
|
||||
if err := restorePath(w, hlTracker, opts.sourceDir, 1); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Worker %d failed: %v\n", workerNum, err)
|
||||
os.Exit(1)
|
||||
}
|
||||
// Create root inode for source directory
|
||||
rootInfo, err := os.Stat(opts.sourceDir)
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to stat source directory: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
w.CreateInode(getFileInfo(rootInfo))
|
||||
err = w.Destroy()
|
||||
if err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Failed to destroy worker: %v\n", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}(worker, i)
|
||||
}
|
||||
|
||||
// Wait for all workers to complete
|
||||
wg.Wait()
|
||||
|
||||
fmt.Println("Restore completed successfully")
|
||||
}
|
||||
3
restore/go.mod
Normal file
3
restore/go.mod
Normal file
@@ -0,0 +1,3 @@
|
||||
module restore
|
||||
|
||||
go 1.21.11
|
||||
472
restore/pkg/restore/restore.go
Normal file
472
restore/pkg/restore/restore.go
Normal file
@@ -0,0 +1,472 @@
|
||||
package restore
|
||||
|
||||
/*
|
||||
#cgo CFLAGS: -I${SRCDIR}/../../../utils/src -I${SRCDIR}/../../../kmod/src
|
||||
#cgo LDFLAGS: -L${SRCDIR}/../../../utils/src -l:scoutfs_parallel_restore.a -lm
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <linux/types.h>
|
||||
#include <stdbool.h>
|
||||
#include <math.h>
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "parallel_restore.h"
|
||||
|
||||
// If there are any type conflicts, you might need to add:
|
||||
// #include "kernel_types.h"
|
||||
*/
|
||||
import "C"
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
const batchSize = 1000
|
||||
const bufSize = 2 * 1024 * 1024
|
||||
|
||||
type WorkerWriter struct {
|
||||
writer *C.struct_scoutfs_parallel_restore_writer
|
||||
progressCh chan *ScoutfsParallelWriterProgress
|
||||
fileCreated int64
|
||||
devFd int
|
||||
buf unsafe.Pointer
|
||||
wg *sync.WaitGroup
|
||||
}
|
||||
|
||||
type MasterWriter struct {
|
||||
writer *C.struct_scoutfs_parallel_restore_writer
|
||||
progressCh chan *ScoutfsParallelWriterProgress
|
||||
workers []*WorkerWriter
|
||||
wg sync.WaitGroup
|
||||
slice *C.struct_scoutfs_parallel_restore_slice // Add slice field
|
||||
progressWg sync.WaitGroup
|
||||
devFd int
|
||||
super *C.struct_scoutfs_super_block
|
||||
}
|
||||
|
||||
type ScoutfsParallelWriterProgress struct {
|
||||
Progress *C.struct_scoutfs_parallel_restore_progress
|
||||
Slice *C.struct_scoutfs_parallel_restore_slice
|
||||
}
|
||||
|
||||
func (m *MasterWriter) aggregateProgress() {
|
||||
defer m.progressWg.Done()
|
||||
for progress := range m.progressCh {
|
||||
ret := C.scoutfs_parallel_restore_add_progress(m.writer, progress.Progress)
|
||||
if ret != 0 {
|
||||
// Handle error appropriately, e.g., log it
|
||||
fmt.Printf("Failed to add progress, error code: %d\n", ret)
|
||||
}
|
||||
if progress.Slice != nil {
|
||||
ret = C.scoutfs_parallel_restore_add_slice(m.writer, progress.Slice)
|
||||
C.free(unsafe.Pointer(progress.Slice))
|
||||
if ret != 0 {
|
||||
// Handle error appropriately, e.g., log it
|
||||
fmt.Printf("Failed to add slice, error code: %d\n", ret)
|
||||
}
|
||||
}
|
||||
// Free the C-allocated progress structures
|
||||
C.free(unsafe.Pointer(progress.Progress))
|
||||
}
|
||||
}
|
||||
|
||||
func (m *MasterWriter) Destroy() {
|
||||
m.wg.Wait()
|
||||
close(m.progressCh)
|
||||
m.progressWg.Wait()
|
||||
|
||||
if m.slice != nil {
|
||||
C.free(unsafe.Pointer(m.slice)) // Free slice on error
|
||||
}
|
||||
if m.super != nil {
|
||||
C.free(unsafe.Pointer(m.super)) // Free superblock on error
|
||||
}
|
||||
if m.devFd != 0 {
|
||||
syscall.Close(m.devFd)
|
||||
}
|
||||
// Destroy master writer
|
||||
C.scoutfs_parallel_restore_destroy_writer(&m.writer)
|
||||
}
|
||||
|
||||
func NewWriters(path string, numWriters int) (*MasterWriter, []*WorkerWriter, error) {
|
||||
if numWriters <= 1 {
|
||||
return nil, nil, errors.New("number of writers must be positive")
|
||||
}
|
||||
|
||||
devFd, err := syscall.Open(path, syscall.O_DIRECT|syscall.O_RDWR|syscall.O_EXCL, 0)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("failed to open metadata device '%s': %v", path, err)
|
||||
}
|
||||
|
||||
var masterWriter MasterWriter
|
||||
masterWriter.progressCh = make(chan *ScoutfsParallelWriterProgress, numWriters*2)
|
||||
masterWriter.workers = make([]*WorkerWriter, 0, numWriters-1)
|
||||
masterWriter.devFd = devFd
|
||||
|
||||
var ret C.int
|
||||
// Allocate aligned memory for superblock
|
||||
var super unsafe.Pointer
|
||||
ret = C.posix_memalign(&super, 4096, C.SCOUTFS_BLOCK_SM_SIZE)
|
||||
if ret != 0 {
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, fmt.Errorf("failed to allocate aligned memory for superblock: %d", ret)
|
||||
}
|
||||
masterWriter.super = (*C.struct_scoutfs_super_block)(super)
|
||||
|
||||
// Read the superblock from devFd
|
||||
superOffset := C.SCOUTFS_SUPER_BLKNO << C.SCOUTFS_BLOCK_SM_SHIFT
|
||||
count, err := syscall.Pread(devFd, (*[1 << 30]byte)(super)[:C.SCOUTFS_BLOCK_SM_SIZE], int64(superOffset))
|
||||
if err != nil {
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, fmt.Errorf("failed to read superblock: %v", err)
|
||||
}
|
||||
if count != int(C.SCOUTFS_BLOCK_SM_SIZE) {
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, fmt.Errorf("failed to read superblock, bytes read: %d", count)
|
||||
}
|
||||
|
||||
// Check if the superblock is valid.
|
||||
if C.le64_to_cpu(masterWriter.super.flags)&C.SCOUTFS_FLAG_IS_META_BDEV == 0 {
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, errors.New("superblock is not a metadata device")
|
||||
}
|
||||
|
||||
// Create master writer
|
||||
ret = C.scoutfs_parallel_restore_create_writer(&masterWriter.writer)
|
||||
if ret != 0 {
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, errors.New("failed to create master writer")
|
||||
}
|
||||
|
||||
ret = C.scoutfs_parallel_restore_import_super(masterWriter.writer, masterWriter.super, C.int(devFd))
|
||||
if ret != 0 {
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, fmt.Errorf("failed to import superblock, error code: %d", ret)
|
||||
}
|
||||
|
||||
// Initialize slices for each worker
|
||||
masterWriter.slice = (*C.struct_scoutfs_parallel_restore_slice)(C.malloc(C.size_t(numWriters) *
|
||||
C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{}))))
|
||||
if masterWriter.slice == nil {
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, errors.New("failed to allocate slices")
|
||||
}
|
||||
|
||||
ret = C.scoutfs_parallel_restore_init_slices(masterWriter.writer,
|
||||
masterWriter.slice,
|
||||
C.int(numWriters))
|
||||
if ret != 0 {
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, errors.New("failed to initialize slices")
|
||||
}
|
||||
|
||||
ret = C.scoutfs_parallel_restore_add_slice(masterWriter.writer, masterWriter.slice)
|
||||
if ret != 0 {
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, errors.New("failed to add slice to master writer")
|
||||
}
|
||||
|
||||
// Create worker writers
|
||||
for i := 1; i < numWriters; i++ {
|
||||
var bufPtr unsafe.Pointer
|
||||
if ret := C.posix_memalign(&bufPtr, 4096, bufSize); ret != 0 {
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, fmt.Errorf("failed to allocate aligned worker buffer: %d", ret)
|
||||
}
|
||||
|
||||
worker := &WorkerWriter{
|
||||
progressCh: masterWriter.progressCh,
|
||||
buf: bufPtr,
|
||||
wg: &masterWriter.wg,
|
||||
}
|
||||
ret = C.scoutfs_parallel_restore_create_writer(&worker.writer)
|
||||
if ret != 0 {
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, errors.New("failed to create worker writer")
|
||||
}
|
||||
|
||||
masterWriter.wg.Add(1)
|
||||
|
||||
// Use each slice for the corresponding worker
|
||||
slice := (*C.struct_scoutfs_parallel_restore_slice)(unsafe.Pointer(uintptr(unsafe.Pointer(masterWriter.slice)) +
|
||||
uintptr(i)*unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{})))
|
||||
ret = C.scoutfs_parallel_restore_add_slice(worker.writer, slice)
|
||||
if ret != 0 {
|
||||
C.scoutfs_parallel_restore_destroy_writer(&worker.writer)
|
||||
masterWriter.Destroy()
|
||||
return nil, nil, errors.New("failed to add slice to worker writer")
|
||||
}
|
||||
|
||||
masterWriter.workers = append(masterWriter.workers, worker)
|
||||
}
|
||||
masterWriter.progressWg.Add(1)
|
||||
go masterWriter.aggregateProgress()
|
||||
|
||||
return &masterWriter, masterWriter.workers, nil
|
||||
|
||||
}
|
||||
|
||||
func (w *WorkerWriter) getProgress(withSlice bool) (*ScoutfsParallelWriterProgress, error) {
|
||||
progress := (*C.struct_scoutfs_parallel_restore_progress)(
|
||||
C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_progress{}))),
|
||||
)
|
||||
if progress == nil {
|
||||
return nil, errors.New("failed to allocate memory for progress")
|
||||
}
|
||||
|
||||
// Fetch the current progress from the C library
|
||||
ret := C.scoutfs_parallel_restore_get_progress(w.writer, progress)
|
||||
if ret != 0 {
|
||||
C.free(unsafe.Pointer(progress))
|
||||
return nil, fmt.Errorf("failed to get progress, error code: %d", ret)
|
||||
}
|
||||
|
||||
var slice *C.struct_scoutfs_parallel_restore_slice
|
||||
if withSlice {
|
||||
slice = (*C.struct_scoutfs_parallel_restore_slice)(
|
||||
C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{}))),
|
||||
)
|
||||
if slice == nil {
|
||||
C.free(unsafe.Pointer(progress))
|
||||
return nil, errors.New("failed to allocate memory for slice")
|
||||
}
|
||||
|
||||
// Optionally fetch the slice information
|
||||
ret = C.scoutfs_parallel_restore_get_slice(w.writer, slice)
|
||||
if ret != 0 {
|
||||
C.free(unsafe.Pointer(progress))
|
||||
C.free(unsafe.Pointer(slice))
|
||||
return nil, fmt.Errorf("failed to get slice, error code: %d", ret)
|
||||
}
|
||||
}
|
||||
|
||||
return &ScoutfsParallelWriterProgress{
|
||||
Progress: progress,
|
||||
Slice: slice,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// writeBuffer writes data from the buffer to the device file descriptor.
|
||||
// It uses scoutfs_parallel_restore_write_buf to get data and pwrite to write it.
|
||||
func (w *WorkerWriter) writeBuffer() (int64, error) {
|
||||
var totalWritten int64
|
||||
var count int64
|
||||
var off int64
|
||||
var ret C.int
|
||||
|
||||
// Allocate memory for off and count
|
||||
offPtr := (*C.off_t)(unsafe.Pointer(&off))
|
||||
countPtr := (*C.size_t)(unsafe.Pointer(&count))
|
||||
|
||||
for {
|
||||
ret = C.scoutfs_parallel_restore_write_buf(w.writer, w.buf,
|
||||
C.size_t(bufSize), offPtr, countPtr)
|
||||
|
||||
if ret != 0 {
|
||||
return totalWritten, fmt.Errorf("failed to write buffer: error code %d", ret)
|
||||
}
|
||||
|
||||
if count > 0 {
|
||||
n, err := syscall.Pwrite(w.devFd, unsafe.Slice((*byte)(w.buf), count), off)
|
||||
if err != nil {
|
||||
return totalWritten, fmt.Errorf("pwrite failed: %v", err)
|
||||
}
|
||||
if n != int(count) {
|
||||
return totalWritten, fmt.Errorf("pwrite wrote %d bytes; expected %d", n, count)
|
||||
}
|
||||
totalWritten += int64(n)
|
||||
}
|
||||
|
||||
if count == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return totalWritten, nil
|
||||
}
|
||||
|
||||
func (w *WorkerWriter) InsertEntry(entry *C.struct_scoutfs_parallel_restore_entry) error {
|
||||
// Add the entry using the C library
|
||||
ret := C.scoutfs_parallel_restore_add_entry(w.writer, entry)
|
||||
if ret != 0 {
|
||||
return fmt.Errorf("failed to add entry, error code: %d", ret)
|
||||
}
|
||||
|
||||
// Increment the fileCreated counter
|
||||
w.fileCreated++
|
||||
if w.fileCreated >= batchSize {
|
||||
_, err := w.writeBuffer()
|
||||
if err != nil {
|
||||
return fmt.Errorf("error writing buffers: %v", err)
|
||||
}
|
||||
// Allocate memory for progress and slice structures
|
||||
progress, err := w.getProgress(false)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
// Send the progress update to the shared progress channel
|
||||
w.progressCh <- progress
|
||||
// Reset the fileCreated counter
|
||||
w.fileCreated = 0
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (w *WorkerWriter) InsertXattr(xattr *C.struct_scoutfs_parallel_restore_xattr) error {
|
||||
ret := C.scoutfs_parallel_restore_add_xattr(w.writer, xattr)
|
||||
if ret != 0 {
|
||||
return fmt.Errorf("failed to add xattr, error code: %d", ret)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (w *WorkerWriter) InsertInode(inode *C.struct_scoutfs_parallel_restore_inode) error {
|
||||
ret := C.scoutfs_parallel_restore_add_inode(w.writer, inode)
|
||||
if ret != 0 {
|
||||
return fmt.Errorf("failed to add inode, error code: %d", ret)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// should only be called once
|
||||
func (w *WorkerWriter) Destroy() error {
|
||||
defer w.wg.Done()
|
||||
// Send final progress if there are remaining entries
|
||||
if w.fileCreated > 0 {
|
||||
_, err := w.writeBuffer()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
progress := &ScoutfsParallelWriterProgress{
|
||||
Progress: (*C.struct_scoutfs_parallel_restore_progress)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_progress{})))),
|
||||
Slice: (*C.struct_scoutfs_parallel_restore_slice)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_slice{})))),
|
||||
}
|
||||
w.progressCh <- progress
|
||||
w.fileCreated = 0
|
||||
}
|
||||
|
||||
if w.buf != nil {
|
||||
C.free(w.buf)
|
||||
w.buf = nil
|
||||
}
|
||||
|
||||
C.scoutfs_parallel_restore_destroy_writer(&w.writer)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Add these new types and functions to the existing restore.go file
|
||||
|
||||
type FileInfo struct {
|
||||
Ino uint64
|
||||
Mode uint32
|
||||
Uid uint32
|
||||
Gid uint32
|
||||
Size uint64
|
||||
Rdev uint64
|
||||
AtimeSec int64
|
||||
AtimeNsec int64
|
||||
MtimeSec int64
|
||||
MtimeNsec int64
|
||||
CtimeSec int64
|
||||
CtimeNsec int64
|
||||
NrSubdirs uint64
|
||||
NameBytes uint64
|
||||
IsDir bool
|
||||
IsRegular bool
|
||||
}
|
||||
|
||||
type XAttr struct {
|
||||
Name string
|
||||
Value []byte
|
||||
}
|
||||
|
||||
// CreateInode creates a C inode structure from FileInfo
|
||||
func (w *WorkerWriter) CreateInode(info FileInfo) error {
|
||||
inode := (*C.struct_scoutfs_parallel_restore_inode)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_inode{}))))
|
||||
if inode == nil {
|
||||
return fmt.Errorf("failed to allocate inode")
|
||||
}
|
||||
defer C.free(unsafe.Pointer(inode))
|
||||
|
||||
inode.ino = C.__u64(info.Ino)
|
||||
inode.mode = C.__u32(info.Mode)
|
||||
inode.uid = C.__u32(info.Uid)
|
||||
inode.gid = C.__u32(info.Gid)
|
||||
inode.size = C.__u64(info.Size)
|
||||
inode.rdev = C.uint(info.Rdev)
|
||||
|
||||
inode.atime.tv_sec = C.__time_t(info.AtimeSec)
|
||||
inode.atime.tv_nsec = C.long(info.AtimeNsec)
|
||||
inode.mtime.tv_sec = C.__time_t(info.MtimeSec)
|
||||
inode.mtime.tv_nsec = C.long(info.MtimeNsec)
|
||||
inode.ctime.tv_sec = C.__time_t(info.CtimeSec)
|
||||
inode.ctime.tv_nsec = C.long(info.CtimeNsec)
|
||||
inode.crtime = inode.ctime
|
||||
|
||||
if info.IsRegular && info.Size > 0 {
|
||||
inode.offline = C.bool(true)
|
||||
}
|
||||
|
||||
if info.IsDir {
|
||||
inode.nr_subdirs = C.__u64(info.NrSubdirs)
|
||||
inode.total_entry_name_bytes = C.__u64(info.NameBytes)
|
||||
}
|
||||
|
||||
return w.InsertInode(inode)
|
||||
}
|
||||
|
||||
// CreateEntry creates a directory entry
|
||||
func (w *WorkerWriter) CreateEntry(dirIno uint64, pos uint64, ino uint64, mode uint32, name string) error {
|
||||
entryC := (*C.struct_scoutfs_parallel_restore_entry)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_entry{})) + C.size_t(len(name))))
|
||||
|
||||
if entryC == nil {
|
||||
return fmt.Errorf("failed to allocate entry")
|
||||
}
|
||||
defer C.free(unsafe.Pointer(entryC))
|
||||
|
||||
entryC.dir_ino = C.__u64(dirIno)
|
||||
entryC.pos = C.__u64(pos)
|
||||
entryC.ino = C.__u64(ino)
|
||||
entryC.mode = C.__u32(mode)
|
||||
entryC.name_len = C.uint(len(name))
|
||||
|
||||
entryC.name = (*C.char)(C.malloc(C.size_t(len(name))))
|
||||
if entryC.name == nil {
|
||||
return fmt.Errorf("failed to allocate entry name")
|
||||
}
|
||||
defer C.free(unsafe.Pointer(entryC.name))
|
||||
copy((*[1 << 30]byte)(unsafe.Pointer(entryC.name))[:len(name)], []byte(name))
|
||||
|
||||
return w.InsertEntry(entryC)
|
||||
}
|
||||
|
||||
// CreateXAttr creates an extended attribute
|
||||
func (w *WorkerWriter) CreateXAttr(ino uint64, pos uint64, xattr XAttr) error {
|
||||
xattrC := (*C.struct_scoutfs_parallel_restore_xattr)(C.malloc(C.size_t(unsafe.Sizeof(C.struct_scoutfs_parallel_restore_xattr{})) + C.size_t(len(xattr.Name)) + C.size_t(len(xattr.Value))))
|
||||
if xattrC == nil {
|
||||
return fmt.Errorf("failed to allocate xattr")
|
||||
}
|
||||
defer C.free(unsafe.Pointer(xattrC))
|
||||
|
||||
xattrC.ino = C.__u64(ino)
|
||||
xattrC.pos = C.__u64(pos)
|
||||
xattrC.name_len = C.uint(len(xattr.Name))
|
||||
xattrC.value_len = C.__u32(len(xattr.Value))
|
||||
|
||||
xattrC.name = (*C.char)(C.malloc(C.size_t(len(xattr.Name))))
|
||||
if xattrC.name == nil {
|
||||
return fmt.Errorf("failed to allocate xattr name")
|
||||
}
|
||||
defer C.free(unsafe.Pointer(xattrC.name))
|
||||
|
||||
copy((*[1 << 30]byte)(unsafe.Pointer(xattrC.name))[:len(xattr.Name)], []byte(xattr.Name))
|
||||
|
||||
xattrC.value = unsafe.Pointer(&xattr.Value[0])
|
||||
|
||||
return w.InsertXattr(xattrC)
|
||||
}
|
||||
10
restore/pkg/restore/restore_test.go
Normal file
10
restore/pkg/restore/restore_test.go
Normal file
@@ -0,0 +1,10 @@
|
||||
package restore
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestNewWriters(t *testing.T) {
|
||||
_, _, err := NewWriters("/tmp", 2)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create master writer: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -13,7 +13,9 @@ BIN := src/createmany \
|
||||
src/create_xattr_loop \
|
||||
src/fragmented_data_extents \
|
||||
src/o_tmpfile_umask \
|
||||
src/o_tmpfile_linkat
|
||||
src/o_tmpfile_linkat \
|
||||
src/parallel_restore \
|
||||
src/restore_copy
|
||||
|
||||
DEPS := $(wildcard src/*.d)
|
||||
|
||||
@@ -23,8 +25,12 @@ ifneq ($(DEPS),)
|
||||
-include $(DEPS)
|
||||
endif
|
||||
|
||||
src/parallel_restore_cflags := ../utils/src/scoutfs_parallel_restore.a -lm
|
||||
src/restore_copy_cflags := ../utils/src/scoutfs_parallel_restore.a -lm
|
||||
|
||||
$(BIN): %: %.c Makefile
|
||||
gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@
|
||||
gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $($(@)_cflags)
|
||||
|
||||
|
||||
.PHONY: clean
|
||||
clean:
|
||||
|
||||
28
tests/golden/parallel_restore
Normal file
28
tests/golden/parallel_restore
Normal file
@@ -0,0 +1,28 @@
|
||||
== simple mkfs/restore/mount
|
||||
committed_seq 1120
|
||||
total_meta_blocks 163840
|
||||
total_data_blocks 15728640
|
||||
1440 1440 57120
|
||||
80 80 400
|
||||
0: offset: 0 length: 1 flags: O.L
|
||||
extents: 1
|
||||
0: offset: 0 length: 1 flags: O.L
|
||||
extents: 1
|
||||
0: offset: 0 length: 1 flags: O.L
|
||||
extents: 1
|
||||
0: offset: 0 length: 1 flags: O.L
|
||||
extents: 1
|
||||
Type Size Total Used Free Use%
|
||||
MetaData 64KB 163840 34722 129118 21
|
||||
Data 4KB 15728640 64 15728576 0
|
||||
7 13,L,- 15,L,- 17,L,- I 33 -
|
||||
== just under ENOSPC
|
||||
Type Size Total Used Free Use%
|
||||
MetaData 64KB 163840 155666 8174 95
|
||||
Data 4KB 15728640 64 15728576 0
|
||||
== just over ENOSPC
|
||||
== ENOSPC
|
||||
== attempt to restore data device
|
||||
== attempt format_v1 restore
|
||||
== test if previously mounted
|
||||
== cleanup
|
||||
@@ -54,4 +54,5 @@ archive-light-cycle.sh
|
||||
block-stale-reads.sh
|
||||
inode-deletion.sh
|
||||
renameat2-noreplace.sh
|
||||
parallel_restore.sh
|
||||
xfstests.sh
|
||||
|
||||
838
tests/src/parallel_restore.c
Normal file
838
tests/src/parallel_restore.c
Normal file
@@ -0,0 +1,838 @@
|
||||
#define _GNU_SOURCE /* O_DIRECT */
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/xattr.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <time.h>
|
||||
#include <sys/prctl.h>
|
||||
#include <signal.h>
|
||||
#include <sys/socket.h>
|
||||
|
||||
#include "../../utils/src/sparse.h"
|
||||
#include "../../utils/src/util.h"
|
||||
#include "../../utils/src/list.h"
|
||||
#include "../../utils/src/parse.h"
|
||||
#include "../../kmod/src/format.h"
|
||||
#include "../../utils/src/parallel_restore.h"
|
||||
|
||||
/*
|
||||
* XXX:
|
||||
* - add a nice description of what's going on
|
||||
* - mention allocator contention
|
||||
* - test child process dying handling
|
||||
* - root dir entry name length is wrong
|
||||
*/
|
||||
|
||||
#define ERRF " errno %d (%s)"
|
||||
#define ERRA errno, strerror(errno)
|
||||
|
||||
#define error_exit(cond, fmt, args...) \
|
||||
do { \
|
||||
if (cond) { \
|
||||
printf("error: "fmt"\n", ##args); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define dprintf(fmt, args...) \
|
||||
do { \
|
||||
if (0) \
|
||||
printf(fmt, ##args); \
|
||||
} while (0)
|
||||
|
||||
#define REG_MODE (S_IFREG | 0644)
|
||||
#define DIR_MODE (S_IFDIR | 0755)
|
||||
|
||||
struct opts {
|
||||
unsigned long long buf_size;
|
||||
|
||||
unsigned long long write_batch;
|
||||
unsigned long long low_dirs;
|
||||
unsigned long long high_dirs;
|
||||
unsigned long long low_files;
|
||||
unsigned long long high_files;
|
||||
char *meta_path;
|
||||
unsigned long long total_files;
|
||||
bool read_only;
|
||||
unsigned long long seed;
|
||||
unsigned long long nr_writers;
|
||||
};
|
||||
|
||||
static void usage(void)
|
||||
{
|
||||
printf("usage:\n"
|
||||
" -b NR | threads write blocks in batches files (100000)\n"
|
||||
" -d LOW:HIGH | range of subdirs per directory (5:10)\n"
|
||||
" -f LOW:HIGH | range of files per directory (10:20)\n"
|
||||
" -m PATH | path to metadata device\n"
|
||||
" -n NR | total number of files to create (100)\n"
|
||||
" -r | read-only, all work except writing, measure cpu cost\n"
|
||||
" -s NR | randomization seed (random)\n"
|
||||
" -w NR | number of writing processes to fork (online cpus)\n"
|
||||
);
|
||||
}
|
||||
|
||||
static size_t write_bufs(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
|
||||
void *buf, size_t buf_size, int dev_fd)
|
||||
{
|
||||
size_t total = 0;
|
||||
size_t count;
|
||||
off_t off;
|
||||
int ret;
|
||||
|
||||
do {
|
||||
ret = scoutfs_parallel_restore_write_buf(wri, buf, buf_size, &off, &count);
|
||||
error_exit(ret, "write buf %d", ret);
|
||||
|
||||
if (count > 0) {
|
||||
if (!opts->read_only)
|
||||
ret = pwrite(dev_fd, buf, count, off);
|
||||
else
|
||||
ret = count;
|
||||
error_exit(ret != count, "pwrite count %zu ret %d", count, ret);
|
||||
total += ret;
|
||||
}
|
||||
} while (count > 0);
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
struct gen_inode {
|
||||
struct scoutfs_parallel_restore_inode inode;
|
||||
struct scoutfs_parallel_restore_xattr **xattrs;
|
||||
u64 nr_xattrs;
|
||||
struct scoutfs_parallel_restore_entry **entries;
|
||||
u64 nr_files;
|
||||
u64 nr_entries;
|
||||
};
|
||||
|
||||
static void free_gino(struct gen_inode *gino)
|
||||
{
|
||||
u64 i;
|
||||
|
||||
if (gino) {
|
||||
if (gino->entries) {
|
||||
for (i = 0; i < gino->nr_entries; i++)
|
||||
free(gino->entries[i]);
|
||||
free(gino->entries);
|
||||
}
|
||||
if (gino->xattrs) {
|
||||
for (i = 0; i < gino->nr_xattrs; i++)
|
||||
free(gino->xattrs[i]);
|
||||
free(gino->xattrs);
|
||||
}
|
||||
free(gino);
|
||||
}
|
||||
}
|
||||
|
||||
static struct scoutfs_parallel_restore_xattr *
|
||||
generate_xattr(struct opts *opts, u64 ino, u64 pos, char *name, int name_len, void *value,
|
||||
int value_len)
|
||||
{
|
||||
struct scoutfs_parallel_restore_xattr *xattr;
|
||||
|
||||
xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + name_len + value_len);
|
||||
error_exit(!xattr, "error allocating generated xattr");
|
||||
|
||||
*xattr = (struct scoutfs_parallel_restore_xattr) {
|
||||
.ino = ino,
|
||||
.pos = pos,
|
||||
.name_len = name_len,
|
||||
.value_len = value_len,
|
||||
};
|
||||
|
||||
xattr->name = (void *)(xattr + 1);
|
||||
xattr->value = (void *)(xattr->name + name_len);
|
||||
|
||||
memcpy(xattr->name, name, name_len);
|
||||
if (value_len)
|
||||
memcpy(xattr->value, value, value_len);
|
||||
|
||||
return xattr;
|
||||
}
|
||||
|
||||
static struct gen_inode *generate_inode(struct opts *opts, u64 ino, mode_t mode)
|
||||
{
|
||||
struct gen_inode *gino;
|
||||
struct timespec now;
|
||||
|
||||
clock_gettime(CLOCK_REALTIME, &now);
|
||||
|
||||
gino = calloc(1, sizeof(struct gen_inode));
|
||||
error_exit(!gino, "failure allocating generated inode");
|
||||
|
||||
gino->inode = (struct scoutfs_parallel_restore_inode) {
|
||||
.ino = ino,
|
||||
.meta_seq = ino,
|
||||
.data_seq = 0,
|
||||
.mode = mode,
|
||||
.atime = now,
|
||||
.ctime = now,
|
||||
.mtime = now,
|
||||
.crtime = now,
|
||||
};
|
||||
|
||||
/*
|
||||
* hacky creation of a bunch of xattrs for now.
|
||||
*/
|
||||
if ((mode & S_IFMT) == S_IFREG) {
|
||||
#define NV(n, v) { n, sizeof(n) - 1, v, sizeof(v) - 1, }
|
||||
struct name_val {
|
||||
char *name;
|
||||
int len;
|
||||
char *value;
|
||||
int value_len;
|
||||
} nv[] = {
|
||||
NV("scoutfs.hide.totl.acct.8314611887310466424.2.0", "1"),
|
||||
NV("scoutfs.hide.srch.sam_vol_E01001L6_4", ""),
|
||||
NV("scoutfs.hide.sam_reqcopies", ""),
|
||||
NV("scoutfs.hide.sam_copy_2", ""),
|
||||
NV("scoutfs.hide.totl.acct.F01030L6.8314611887310466424.7.30", "1"),
|
||||
NV("scoutfs.hide.sam_copy_1", ""),
|
||||
NV("scoutfs.hide.srch.sam_vol_F01030L6_4", ""),
|
||||
NV("scoutfs.hide.srch.sam_release_cand", ""),
|
||||
NV("scoutfs.hide.sam_restime", ""),
|
||||
NV("scoutfs.hide.sam_uuid", ""),
|
||||
NV("scoutfs.hide.totl.acct.8314611887310466424.3.0", "1"),
|
||||
NV("scoutfs.hide.srch.sam_vol_F01030L6", ""),
|
||||
NV("scoutfs.hide.srch.sam_uuid_865939b7-24d6-472f-b85c-7ce7afeb813a", ""),
|
||||
NV("scoutfs.hide.srch.sam_vol_E01001L6", ""),
|
||||
NV("scoutfs.hide.totl.acct.E01001L6.8314611887310466424.7.1", "1"),
|
||||
NV("scoutfs.hide.totl.acct.8314611887310466424.4.0", "1"),
|
||||
NV("scoutfs.hide.totl.acct.8314611887310466424.11.0", "1"),
|
||||
NV("scoutfs.hide.totl.acct.8314611887310466424.1.0", "1"),
|
||||
};
|
||||
unsigned int nr = array_size(nv);
|
||||
int i;
|
||||
|
||||
gino->xattrs = calloc(nr, sizeof(struct scoutfs_parallel_restore_xattr *));
|
||||
|
||||
for (i = 0; i < nr; i++)
|
||||
gino->xattrs[i] = generate_xattr(opts, ino, i, nv[i].name, nv[i].len,
|
||||
nv[i].value, nv[i].value_len);
|
||||
|
||||
gino->nr_xattrs = nr;
|
||||
gino->inode.nr_xattrs = nr;
|
||||
|
||||
gino->inode.size = 4096;
|
||||
gino->inode.offline = true;
|
||||
}
|
||||
|
||||
return gino;
|
||||
}
|
||||
|
||||
static struct scoutfs_parallel_restore_entry *
|
||||
generate_entry(struct opts *opts, char *prefix, u64 nr, u64 dir_ino, u64 pos, u64 ino, mode_t mode)
|
||||
{
|
||||
struct scoutfs_parallel_restore_entry *entry;
|
||||
char buf[PATH_MAX];
|
||||
int bytes;
|
||||
|
||||
bytes = snprintf(buf, sizeof(buf), "%s-%llu", prefix, nr);
|
||||
|
||||
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + bytes);
|
||||
error_exit(!entry, "error allocating generated entry");
|
||||
|
||||
*entry = (struct scoutfs_parallel_restore_entry) {
|
||||
.dir_ino = dir_ino,
|
||||
.pos = pos,
|
||||
.ino = ino,
|
||||
.mode = mode,
|
||||
.name = (void *)(entry + 1),
|
||||
.name_len = bytes,
|
||||
};
|
||||
|
||||
memcpy(entry->name, buf, bytes);
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
/*
|
||||
* since the _parallel_restore_quota_rule mimics the squota_rule found in the
|
||||
* kernel we can also mimic its rule_to_irule function
|
||||
*/
|
||||
|
||||
#define TEST_RULE_STR "7 13,L,- 15,L,- 17,L,- I 33 -"
|
||||
|
||||
static struct scoutfs_parallel_restore_quota_rule *
|
||||
generate_quota(struct opts *opts)
|
||||
{
|
||||
struct scoutfs_parallel_restore_quota_rule *prule;
|
||||
int err;
|
||||
|
||||
prule = calloc(1, sizeof(struct scoutfs_parallel_restore_quota_rule));
|
||||
error_exit(!prule, "Quota rule alloc failed");
|
||||
|
||||
err = sscanf(TEST_RULE_STR, " %hhu %llu,%c,%c %llu,%c,%c %llu,%c,%c %c %llu %c",
|
||||
&prule->prio,
|
||||
&prule->names[0].val, &prule->names[0].source, &prule->names[0].flags,
|
||||
&prule->names[1].val, &prule->names[1].source, &prule->names[1].flags,
|
||||
&prule->names[2].val, &prule->names[2].source, &prule->names[2].flags,
|
||||
&prule->op, &prule->limit, &prule->rule_flags);
|
||||
error_exit(err != 13, "invalid quota rule, missing fields. nr fields: %d rule str: %s\n", err, TEST_RULE_STR);
|
||||
|
||||
return prule;
|
||||
}
|
||||
|
||||
static u64 random64(void)
|
||||
{
|
||||
return ((u64)lrand48() << 32) | lrand48();
|
||||
}
|
||||
|
||||
static u64 random_range(u64 low, u64 high)
|
||||
{
|
||||
return low + (random64() % (high - low + 1));
|
||||
}
|
||||
|
||||
static struct gen_inode *generate_dir(struct opts *opts, u64 dir_ino, u64 ino_start, u64 ino_len,
|
||||
bool no_dirs)
|
||||
{
|
||||
struct scoutfs_parallel_restore_entry *entry;
|
||||
struct gen_inode *gino;
|
||||
u64 nr_entries;
|
||||
u64 nr_files;
|
||||
u64 nr_dirs;
|
||||
u64 ino;
|
||||
char *prefix;
|
||||
mode_t mode;
|
||||
u64 i;
|
||||
|
||||
nr_dirs = no_dirs ? 0 : random_range(opts->low_dirs, opts->high_dirs);
|
||||
nr_files = random_range(opts->low_files, opts->high_files);
|
||||
|
||||
if (1 + nr_dirs + nr_files > ino_len) {
|
||||
nr_dirs = no_dirs ? 0 : (ino_len - 1) / 2;
|
||||
nr_files = (ino_len - 1) - nr_dirs;
|
||||
}
|
||||
|
||||
nr_entries = nr_dirs + nr_files;
|
||||
|
||||
gino = generate_inode(opts, dir_ino, DIR_MODE);
|
||||
error_exit(!gino, "error allocating generated inode");
|
||||
|
||||
gino->inode.nr_subdirs = nr_dirs;
|
||||
gino->nr_files = nr_files;
|
||||
|
||||
if (nr_entries) {
|
||||
gino->entries = calloc(nr_entries, sizeof(struct scoutfs_parallel_restore_entry *));
|
||||
error_exit(!gino->entries, "error allocating generated inode entries");
|
||||
|
||||
gino->nr_entries = nr_entries;
|
||||
}
|
||||
|
||||
mode = DIR_MODE;
|
||||
prefix = "dir";
|
||||
for (i = 0; i < nr_entries; i++) {
|
||||
if (i == nr_dirs) {
|
||||
mode = REG_MODE;
|
||||
prefix = "file";
|
||||
}
|
||||
|
||||
ino = ino_start + i;
|
||||
entry = generate_entry(opts, prefix, ino, gino->inode.ino,
|
||||
SCOUTFS_DIRENT_FIRST_POS + i, ino, mode);
|
||||
|
||||
gino->entries[i] = entry;
|
||||
gino->inode.total_entry_name_bytes += entry->name_len;
|
||||
}
|
||||
|
||||
return gino;
|
||||
}
|
||||
|
||||
/*
|
||||
* Restore a generated inode. If it's a directory then we also restore
|
||||
* all its entries. The caller is going to descend into subdir entries and generate
|
||||
* those dir inodes. We have to generate and restore all non-dir inodes referenced
|
||||
* by this inode's entries.
|
||||
*/
|
||||
static void restore_inode(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
|
||||
struct gen_inode *gino)
|
||||
{
|
||||
struct gen_inode *nondir;
|
||||
int ret;
|
||||
u64 i;
|
||||
|
||||
ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
|
||||
error_exit(ret, "thread add root inode %d", ret);
|
||||
|
||||
for (i = 0; i < gino->nr_entries; i++) {
|
||||
ret = scoutfs_parallel_restore_add_entry(wri, gino->entries[i]);
|
||||
error_exit(ret, "thread add entry %d", ret);
|
||||
|
||||
/* caller only needs subdir entries, generate and free others */
|
||||
if ((gino->entries[i]->mode & S_IFMT) != S_IFDIR) {
|
||||
|
||||
nondir = generate_inode(opts, gino->entries[i]->ino,
|
||||
gino->entries[i]->mode);
|
||||
restore_inode(opts, wri, nondir);
|
||||
free_gino(nondir);
|
||||
|
||||
free(gino->entries[i]);
|
||||
if (i != gino->nr_entries - 1)
|
||||
gino->entries[i] = gino->entries[gino->nr_entries - 1];
|
||||
gino->nr_entries--;
|
||||
gino->nr_files--;
|
||||
i--;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < gino->nr_xattrs; i++) {
|
||||
ret = scoutfs_parallel_restore_add_xattr(wri, gino->xattrs[i]);
|
||||
error_exit(ret, "thread add xattr %d", ret);
|
||||
}
|
||||
}
|
||||
|
||||
struct writer_args {
|
||||
struct list_head head;
|
||||
|
||||
int dev_fd;
|
||||
int pair_fd;
|
||||
|
||||
struct scoutfs_parallel_restore_slice slice;
|
||||
u64 writer_nr;
|
||||
u64 dir_height;
|
||||
u64 ino_start;
|
||||
u64 ino_len;
|
||||
};
|
||||
|
||||
struct write_result {
|
||||
struct scoutfs_parallel_restore_progress prog;
|
||||
struct scoutfs_parallel_restore_slice slice;
|
||||
__le64 files_created;
|
||||
__le64 bytes_written;
|
||||
};
|
||||
|
||||
static void write_bufs_and_send(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
|
||||
void *buf, size_t buf_size, int dev_fd,
|
||||
struct write_result *res, bool get_slice, int pair_fd)
|
||||
{
|
||||
size_t total;
|
||||
int ret;
|
||||
|
||||
total = write_bufs(opts, wri, buf, buf_size, dev_fd);
|
||||
le64_add_cpu(&res->bytes_written, total);
|
||||
|
||||
ret = scoutfs_parallel_restore_get_progress(wri, &res->prog);
|
||||
error_exit(ret, "get prog %d", ret);
|
||||
|
||||
if (get_slice) {
|
||||
ret = scoutfs_parallel_restore_get_slice(wri, &res->slice);
|
||||
error_exit(ret, "thread get slice %d", ret);
|
||||
}
|
||||
|
||||
ret = write(pair_fd, res, sizeof(struct write_result));
|
||||
error_exit(ret != sizeof(struct write_result), "result send error");
|
||||
|
||||
memset(res, 0, sizeof(struct write_result));
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate the number of bytes in toplevel "dir-%llu" entry names for the given
|
||||
* number of writers.
|
||||
*/
|
||||
static u64 topdir_entry_bytes(u64 nr_writers)
|
||||
{
|
||||
u64 bytes = (3 + 1) * nr_writers;
|
||||
u64 limit;
|
||||
u64 done;
|
||||
u64 wid;
|
||||
u64 nr;
|
||||
|
||||
for (done = 0, wid = 1, limit = 10; done < nr_writers; done += nr, wid++, limit *= 10) {
|
||||
nr = min(limit - done, nr_writers - done);
|
||||
bytes += nr * wid;
|
||||
}
|
||||
|
||||
return bytes;
|
||||
}
|
||||
|
||||
struct dir_pos {
|
||||
struct gen_inode *gino;
|
||||
u64 pos;
|
||||
};
|
||||
|
||||
static void writer_proc(struct opts *opts, struct writer_args *args)
|
||||
{
|
||||
struct scoutfs_parallel_restore_writer *wri = NULL;
|
||||
struct scoutfs_parallel_restore_entry *entry;
|
||||
struct dir_pos *dirs = NULL;
|
||||
struct write_result res;
|
||||
struct gen_inode *gino;
|
||||
void *buf = NULL;
|
||||
u64 level;
|
||||
u64 ino;
|
||||
int ret;
|
||||
|
||||
memset(&res, 0, sizeof(res));
|
||||
|
||||
dirs = calloc(args->dir_height, sizeof(struct dir_pos));
|
||||
error_exit(errno, "error allocating parent dirs "ERRF, ERRA);
|
||||
|
||||
errno = posix_memalign((void **)&buf, 4096, opts->buf_size);
|
||||
error_exit(errno, "error allocating block buf "ERRF, ERRA);
|
||||
|
||||
ret = scoutfs_parallel_restore_create_writer(&wri);
|
||||
error_exit(ret, "create writer %d", ret);
|
||||
|
||||
ret = scoutfs_parallel_restore_add_slice(wri, &args->slice);
|
||||
error_exit(ret, "add slice %d", ret);
|
||||
|
||||
/* writer 0 creates the root dir */
|
||||
if (args->writer_nr == 0) {
|
||||
gino = generate_inode(opts, SCOUTFS_ROOT_INO, DIR_MODE);
|
||||
gino->inode.nr_subdirs = opts->nr_writers;
|
||||
gino->inode.total_entry_name_bytes = topdir_entry_bytes(opts->nr_writers);
|
||||
|
||||
ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
|
||||
error_exit(ret, "thread add root inode %d", ret);
|
||||
free_gino(gino);
|
||||
}
|
||||
|
||||
/* create root entry for our top level dir */
|
||||
ino = args->ino_start++;
|
||||
args->ino_len--;
|
||||
|
||||
entry = generate_entry(opts, "top", args->writer_nr,
|
||||
SCOUTFS_ROOT_INO, SCOUTFS_DIRENT_FIRST_POS + args->writer_nr,
|
||||
ino, DIR_MODE);
|
||||
|
||||
ret = scoutfs_parallel_restore_add_entry(wri, entry);
|
||||
error_exit(ret, "thread top entry %d", ret);
|
||||
free(entry);
|
||||
|
||||
level = args->dir_height - 1;
|
||||
|
||||
while (args->ino_len > 0 && level < args->dir_height) {
|
||||
gino = dirs[level].gino;
|
||||
|
||||
/* generate and restore if we follow entries */
|
||||
if (!gino) {
|
||||
gino = generate_dir(opts, ino, args->ino_start, args->ino_len, level == 0);
|
||||
args->ino_start += gino->nr_entries;
|
||||
args->ino_len -= gino->nr_entries;
|
||||
le64_add_cpu(&res.files_created, gino->nr_files);
|
||||
|
||||
restore_inode(opts, wri, gino);
|
||||
dirs[level].gino = gino;
|
||||
}
|
||||
|
||||
if (dirs[level].pos == gino->nr_entries) {
|
||||
/* ascend if we're done with this dir */
|
||||
dirs[level].gino = NULL;
|
||||
dirs[level].pos = 0;
|
||||
free_gino(gino);
|
||||
level++;
|
||||
|
||||
} else {
|
||||
/* otherwise descend into subdir entry */
|
||||
ino = gino->entries[dirs[level].pos]->ino;
|
||||
dirs[level].pos++;
|
||||
level--;
|
||||
}
|
||||
|
||||
/* do a partial write at batch intervals when there's still more to do */
|
||||
if (le64_to_cpu(res.files_created) >= opts->write_batch && args->ino_len > 0)
|
||||
write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
|
||||
&res, false, args->pair_fd);
|
||||
}
|
||||
|
||||
write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
|
||||
&res, true, args->pair_fd);
|
||||
|
||||
scoutfs_parallel_restore_destroy_writer(&wri);
|
||||
|
||||
free(dirs);
|
||||
free(buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* If any of our children exited with an error code, we hard exit.
|
||||
* The child processes should themselves report out any errors
|
||||
* encountered. Any remaining children will receive SIGHUP and
|
||||
* terminate.
|
||||
*/
|
||||
static void sigchld_handler(int signo, siginfo_t *info, void *context)
|
||||
{
|
||||
if (info->si_status)
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
static void fork_writer(struct opts *opts, struct writer_args *args)
|
||||
{
|
||||
pid_t parent = getpid();
|
||||
pid_t pid;
|
||||
int ret;
|
||||
|
||||
pid = fork();
|
||||
error_exit(pid == -1, "fork error");
|
||||
|
||||
if (pid != 0)
|
||||
return;
|
||||
|
||||
ret = prctl(PR_SET_PDEATHSIG, SIGHUP);
|
||||
error_exit(ret < 0, "failed to set parent death sig");
|
||||
|
||||
printf("pid %u getpid() %u parent %u getppid() %u\n",
|
||||
pid, getpid(), parent, getppid());
|
||||
error_exit(getppid() != parent, "child parent already changed");
|
||||
|
||||
writer_proc(opts, args);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
static int do_restore(struct opts *opts)
|
||||
{
|
||||
struct scoutfs_parallel_restore_writer *wri = NULL;
|
||||
struct scoutfs_parallel_restore_slice *slices = NULL;
|
||||
struct scoutfs_parallel_restore_quota_rule *rule = NULL;
|
||||
struct scoutfs_super_block *super = NULL;
|
||||
struct write_result res;
|
||||
struct writer_args *args;
|
||||
struct timespec begin;
|
||||
struct timespec end;
|
||||
LIST_HEAD(writers);
|
||||
u64 next_ino;
|
||||
u64 ino_per;
|
||||
u64 avg_dirs;
|
||||
u64 avg_files;
|
||||
u64 dir_height;
|
||||
u64 tot_files;
|
||||
u64 tot_bytes;
|
||||
int pair[2] = {-1, -1};
|
||||
float secs;
|
||||
void *buf = NULL;
|
||||
int dev_fd = -1;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
|
||||
error_exit(ret, "socketpair error "ERRF, ERRA);
|
||||
|
||||
dev_fd = open(opts->meta_path, O_DIRECT | (opts->read_only ? O_RDONLY : (O_RDWR|O_EXCL)));
|
||||
error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA);
|
||||
|
||||
errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?:
|
||||
posix_memalign((void **)&buf, 4096, opts->buf_size);
|
||||
error_exit(errno, "error allocating block bufs "ERRF, ERRA);
|
||||
|
||||
ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
|
||||
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
|
||||
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret);
|
||||
|
||||
ret = scoutfs_parallel_restore_create_writer(&wri);
|
||||
error_exit(ret, "create writer %d", ret);
|
||||
|
||||
ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd);
|
||||
error_exit(ret, "import super %d", ret);
|
||||
|
||||
rule = generate_quota(opts);
|
||||
ret = scoutfs_parallel_restore_add_quota_rule(wri, rule);
|
||||
free(rule);
|
||||
error_exit(ret, "add quotas %d", ret);
|
||||
|
||||
slices = calloc(1 + opts->nr_writers, sizeof(struct scoutfs_parallel_restore_slice));
|
||||
error_exit(!slices, "alloc slices");
|
||||
|
||||
scoutfs_parallel_restore_init_slices(wri, slices, 1 + opts->nr_writers);
|
||||
|
||||
ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]);
|
||||
error_exit(ret, "add slices[0] %d", ret);
|
||||
|
||||
next_ino = (SCOUTFS_ROOT_INO | SCOUTFS_LOCK_INODE_GROUP_MASK) + 1;
|
||||
ino_per = opts->total_files / opts->nr_writers;
|
||||
avg_dirs = (opts->low_dirs + opts->high_dirs) / 2;
|
||||
avg_files = (opts->low_files + opts->high_files) / 2;
|
||||
|
||||
dir_height = 1;
|
||||
tot_files = avg_files * opts->nr_writers;
|
||||
|
||||
while (tot_files < opts->total_files) {
|
||||
dir_height++;
|
||||
tot_files *= avg_dirs;
|
||||
}
|
||||
|
||||
dprintf("height %llu tot %llu total %llu\n", dir_height, tot_files, opts->total_files);
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &begin);
|
||||
|
||||
/* start each writing process */
|
||||
for (i = 0; i < opts->nr_writers; i++) {
|
||||
args = calloc(1, sizeof(struct writer_args));
|
||||
error_exit(!args, "alloc writer args");
|
||||
|
||||
args->dev_fd = dev_fd;
|
||||
args->pair_fd = pair[1];
|
||||
args->slice = slices[1 + i];
|
||||
args->writer_nr = i;
|
||||
args->dir_height = dir_height;
|
||||
args->ino_start = next_ino;
|
||||
args->ino_len = ino_per;
|
||||
|
||||
list_add_tail(&args->head, &writers);
|
||||
next_ino += ino_per;
|
||||
|
||||
fork_writer(opts, args);
|
||||
}
|
||||
|
||||
/* read results and watch for writers to finish */
|
||||
tot_files = 0;
|
||||
tot_bytes = 0;
|
||||
i = 0;
|
||||
while (i < opts->nr_writers) {
|
||||
ret = read(pair[0], &res, sizeof(struct write_result));
|
||||
error_exit(ret != sizeof(struct write_result), "result read error %d", ret);
|
||||
|
||||
ret = scoutfs_parallel_restore_add_progress(wri, &res.prog);
|
||||
error_exit(ret, "add thr prog %d", ret);
|
||||
|
||||
if (res.slice.meta_len != 0) {
|
||||
ret = scoutfs_parallel_restore_add_slice(wri, &res.slice);
|
||||
error_exit(ret, "add thr slice %d", ret);
|
||||
i++;
|
||||
}
|
||||
|
||||
tot_files += le64_to_cpu(res.files_created);
|
||||
tot_bytes += le64_to_cpu(res.bytes_written);
|
||||
}
|
||||
|
||||
tot_bytes += write_bufs(opts, wri, buf, opts->buf_size, dev_fd);
|
||||
|
||||
ret = scoutfs_parallel_restore_export_super(wri, super);
|
||||
error_exit(ret, "update super %d", ret);
|
||||
|
||||
if (!opts->read_only) {
|
||||
ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
|
||||
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
|
||||
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret);
|
||||
}
|
||||
|
||||
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
|
||||
|
||||
scoutfs_parallel_restore_destroy_writer(&wri);
|
||||
|
||||
secs = ((float)end.tv_sec + ((float)end.tv_nsec/NSEC_PER_SEC)) -
|
||||
((float)begin.tv_sec + ((float)begin.tv_nsec/NSEC_PER_SEC));
|
||||
printf("created %llu files in %llu bytes and %f secs => %f bytes/file, %f files/sec\n",
|
||||
tot_files, tot_bytes, secs,
|
||||
(float)tot_bytes / tot_files, (float)tot_files / secs);
|
||||
|
||||
if (dev_fd >= 0)
|
||||
close(dev_fd);
|
||||
if (pair[0] >= 0)
|
||||
close(pair[0]);
|
||||
if (pair[1] >= 0)
|
||||
close(pair[1]);
|
||||
free(super);
|
||||
free(slices);
|
||||
free(buf);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int parse_low_high(char *str, u64 *low_ret, u64 *high_ret)
|
||||
{
|
||||
char *sep;
|
||||
int ret = 0;
|
||||
|
||||
sep = index(str, ':');
|
||||
if (sep) {
|
||||
*sep = '\0';
|
||||
ret = parse_u64(sep + 1, high_ret);
|
||||
}
|
||||
|
||||
if (ret == 0)
|
||||
ret = parse_u64(str, low_ret);
|
||||
|
||||
if (sep)
|
||||
*sep = ':';
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct opts opts = {
|
||||
.buf_size = (32 * 1024 * 1024),
|
||||
|
||||
.write_batch = 1000000,
|
||||
.low_dirs = 5,
|
||||
.high_dirs = 10,
|
||||
.low_files = 10,
|
||||
.high_files = 20,
|
||||
.total_files = 100,
|
||||
};
|
||||
struct sigaction act = { 0 };
|
||||
int ret;
|
||||
int c;
|
||||
|
||||
opts.seed = random64();
|
||||
opts.nr_writers = sysconf(_SC_NPROCESSORS_ONLN);
|
||||
|
||||
while ((c = getopt(argc, argv, "b:d:f:m:n:rs:w:")) != -1) {
|
||||
switch(c) {
|
||||
case 'b':
|
||||
ret = parse_u64(optarg, &opts.write_batch);
|
||||
error_exit(ret, "error parsing -b '%s'\n", optarg);
|
||||
error_exit(opts.write_batch == 0, "-b can't be 0");
|
||||
break;
|
||||
case 'd':
|
||||
ret = parse_low_high(optarg, &opts.low_dirs, &opts.high_dirs);
|
||||
error_exit(ret, "error parsing -d '%s'\n", optarg);
|
||||
break;
|
||||
case 'f':
|
||||
ret = parse_low_high(optarg, &opts.low_files, &opts.high_files);
|
||||
error_exit(ret, "error parsing -f '%s'\n", optarg);
|
||||
break;
|
||||
case 'm':
|
||||
opts.meta_path = strdup(optarg);
|
||||
break;
|
||||
case 'n':
|
||||
ret = parse_u64(optarg, &opts.total_files);
|
||||
error_exit(ret, "error parsing -n '%s'\n", optarg);
|
||||
break;
|
||||
case 'r':
|
||||
opts.read_only = true;
|
||||
break;
|
||||
case 's':
|
||||
ret = parse_u64(optarg, &opts.seed);
|
||||
error_exit(ret, "error parsing -s '%s'\n", optarg);
|
||||
break;
|
||||
case 'w':
|
||||
ret = parse_u64(optarg, &opts.nr_writers);
|
||||
error_exit(ret, "error parsing -w '%s'\n", optarg);
|
||||
break;
|
||||
case '?':
|
||||
printf("Unknown option '%c'\n", optopt);
|
||||
usage();
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
error_exit(opts.low_dirs > opts.high_dirs, "LOW > HIGH in -d %llu:%llu",
|
||||
opts.low_dirs, opts.high_dirs);
|
||||
error_exit(opts.low_files > opts.high_files, "LOW > HIGH in -f %llu:%llu",
|
||||
opts.low_files, opts.high_files);
|
||||
error_exit(!opts.meta_path, "must specify metadata device path with -m");
|
||||
|
||||
printf("recreate with: -d %llu:%llu -f %llu:%llu -n %llu -s %llu -w %llu\n",
|
||||
opts.low_dirs, opts.high_dirs, opts.low_files, opts.high_files,
|
||||
opts.total_files, opts.seed, opts.nr_writers);
|
||||
|
||||
act.sa_flags = SA_SIGINFO | SA_RESTART;
|
||||
act.sa_sigaction = &sigchld_handler;
|
||||
if (sigaction(SIGCHLD, &act, NULL) == -1)
|
||||
error_exit(ret, "error setting up signal handler\n");
|
||||
|
||||
ret = do_restore(&opts);
|
||||
|
||||
free(opts.meta_path);
|
||||
|
||||
return ret == 0 ? 0 : 1;
|
||||
}
|
||||
817
tests/src/restore_copy.c
Normal file
817
tests/src/restore_copy.c
Normal file
@@ -0,0 +1,817 @@
|
||||
#define _GNU_SOURCE /* O_DIRECT */
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/xattr.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <time.h>
|
||||
#include <sys/prctl.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/signal.h>
|
||||
#include <sys/statfs.h>
|
||||
#include <dirent.h>
|
||||
|
||||
#include "../../utils/src/sparse.h"
|
||||
#include "../../utils/src/util.h"
|
||||
#include "../../utils/src/list.h"
|
||||
#include "../../utils/src/parse.h"
|
||||
#include "../../kmod/src/format.h"
|
||||
#include "../../kmod/src/ioctl.h"
|
||||
#include "../../utils/src/parallel_restore.h"
|
||||
|
||||
/*
|
||||
* XXX:
|
||||
*/
|
||||
|
||||
#define ERRF " errno %d (%s)"
|
||||
#define ERRA errno, strerror(errno)
|
||||
|
||||
#define error_exit(cond, fmt, args...) \
|
||||
do { \
|
||||
if (cond) { \
|
||||
printf("error: "fmt"\n", ##args); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define REG_MODE (S_IFREG | 0644)
|
||||
#define DIR_MODE (S_IFDIR | 0755)
|
||||
#define LNK_MODE (S_IFLNK | 0777)
|
||||
|
||||
/*
|
||||
* At about 1k files we seem to be writing about 1MB of data, so
|
||||
* set buffer sizes adequately above that.
|
||||
*/
|
||||
#define BATCH_FILES 1024
|
||||
#define BUF_SIZ 2 * 1024 * 1024
|
||||
|
||||
/*
|
||||
* We can't make duplicate inodes for hardlinked files, so we
|
||||
* will need to track these as we generate them. Not too costly
|
||||
* to do, since it's just an integer, and sorting shouldn't matter
|
||||
* until we get into the millions of entries, hopefully.
|
||||
*/
|
||||
static struct list_head hardlinks;
|
||||
struct hardlink_head {
|
||||
struct list_head head;
|
||||
u64 ino;
|
||||
};
|
||||
|
||||
struct opts {
|
||||
char *meta_path;
|
||||
char *source_dir;
|
||||
};
|
||||
|
||||
static bool warn_scoutfs = false;
|
||||
|
||||
static void usage(void)
|
||||
{
|
||||
printf("usage:\n"
|
||||
" -m PATH | path to metadata device\n"
|
||||
" -s PATH | path to source directory\n"
|
||||
);
|
||||
}
|
||||
|
||||
static size_t write_bufs(struct scoutfs_parallel_restore_writer *wri,
|
||||
void *buf, int dev_fd)
|
||||
{
|
||||
size_t total = 0;
|
||||
size_t count;
|
||||
off_t off;
|
||||
int ret;
|
||||
|
||||
do {
|
||||
ret = scoutfs_parallel_restore_write_buf(wri, buf, BUF_SIZ, &off, &count);
|
||||
error_exit(ret, "write buf %d", ret);
|
||||
|
||||
if (count > 0) {
|
||||
ret = pwrite(dev_fd, buf, count, off);
|
||||
error_exit(ret != count, "pwrite count %zu ret %d", count, ret);
|
||||
total += ret;
|
||||
}
|
||||
} while (count > 0);
|
||||
|
||||
return total;
|
||||
}
|
||||
|
||||
struct write_result {
|
||||
struct scoutfs_parallel_restore_progress prog;
|
||||
struct scoutfs_parallel_restore_slice slice;
|
||||
__le64 files_created;
|
||||
__le64 dirs_created;
|
||||
__le64 bytes_written;
|
||||
bool complete;
|
||||
};
|
||||
|
||||
static void write_bufs_and_send(struct scoutfs_parallel_restore_writer *wri,
|
||||
void *buf, int dev_fd,
|
||||
struct write_result *res, bool get_slice, int pair_fd)
|
||||
{
|
||||
size_t total;
|
||||
int ret;
|
||||
|
||||
total = write_bufs(wri, buf, dev_fd);
|
||||
le64_add_cpu(&res->bytes_written, total);
|
||||
|
||||
ret = scoutfs_parallel_restore_get_progress(wri, &res->prog);
|
||||
error_exit(ret, "get prog %d", ret);
|
||||
|
||||
if (get_slice) {
|
||||
ret = scoutfs_parallel_restore_get_slice(wri, &res->slice);
|
||||
error_exit(ret, "thread get slice %d", ret);
|
||||
}
|
||||
|
||||
ret = write(pair_fd, res, sizeof(struct write_result));
|
||||
error_exit(ret != sizeof(struct write_result), "result send error");
|
||||
|
||||
memset(res, 0, sizeof(struct write_result));
|
||||
}
|
||||
|
||||
/*
|
||||
* Adding xattrs is supported for files and directories only.
|
||||
*
|
||||
* If the filesystem on which the path resides isn't scoutfs, we omit the
|
||||
* scoutfs specific ioctl to fetch hidden xattrs.
|
||||
*
|
||||
* Untested if the hidden xattr ioctl works on directories or symlinks.
|
||||
*/
|
||||
static void add_xattrs(struct scoutfs_parallel_restore_writer *wri, char *path, u64 ino, bool is_scoutfs)
|
||||
{
|
||||
struct scoutfs_ioctl_listxattr_hidden lxh;
|
||||
struct scoutfs_parallel_restore_xattr *xattr;
|
||||
char *buf = NULL;
|
||||
char *name = NULL;
|
||||
int fd = -1;
|
||||
int bytes;
|
||||
int len;
|
||||
int value_len;
|
||||
int ret;
|
||||
int pos = 0;
|
||||
|
||||
if (!is_scoutfs)
|
||||
goto normal_xattrs;
|
||||
|
||||
fd = open(path, O_RDONLY);
|
||||
error_exit(fd < 0, "open"ERRF, ERRA);
|
||||
|
||||
memset(&lxh, 0, sizeof(lxh));
|
||||
lxh.id_pos = 0;
|
||||
lxh.hash_pos = 0;
|
||||
lxh.buf_bytes = 256 * 1024;
|
||||
|
||||
buf = malloc(lxh.buf_bytes);
|
||||
error_exit(!buf, "alloc xattr_hidden buf");
|
||||
lxh.buf_ptr = (unsigned long)buf;
|
||||
|
||||
/* hidden */
|
||||
for (;;) {
|
||||
ret = ioctl(fd, SCOUTFS_IOC_LISTXATTR_HIDDEN, &lxh);
|
||||
if (ret == 0) /* done */
|
||||
break;
|
||||
error_exit(ret < 0, "listxattr_hidden"ERRF, ERRA);
|
||||
bytes = ret;
|
||||
error_exit(bytes > lxh.buf_bytes, "listxattr_hidden overflow");
|
||||
error_exit(buf[bytes - 1] != '\0', "listxattr_hidden didn't term");
|
||||
|
||||
name = buf;
|
||||
|
||||
do {
|
||||
len = strlen(name);
|
||||
error_exit(len == 0, "listxattr_hidden empty name");
|
||||
error_exit(len > SCOUTFS_XATTR_MAX_NAME_LEN, "listxattr_hidden long name");
|
||||
|
||||
/* get value len */
|
||||
value_len = fgetxattr(fd, name, NULL, 0);
|
||||
error_exit(value_len < 0, "malloc value hidden"ERRF, ERRA);
|
||||
|
||||
/* allocate everything at once */
|
||||
xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + len + value_len);
|
||||
error_exit(!xattr, "error allocating generated xattr");
|
||||
|
||||
*xattr = (struct scoutfs_parallel_restore_xattr) {
|
||||
.ino = ino,
|
||||
.pos = pos++,
|
||||
.name_len = len,
|
||||
.value_len = value_len,
|
||||
};
|
||||
xattr->name = (void *)(xattr + 1);
|
||||
xattr->value = (void *)(xattr->name + len);
|
||||
|
||||
/* get value into xattr directly */
|
||||
ret = fgetxattr(fd, name, (void *)(xattr->name + len), value_len);
|
||||
error_exit(ret != value_len, "fgetxattr value"ERRF, ERRA);
|
||||
|
||||
memcpy(xattr->name, name, len);
|
||||
|
||||
ret = scoutfs_parallel_restore_add_xattr(wri, xattr);
|
||||
error_exit(ret, "add hidden xattr %d", ret);
|
||||
|
||||
free(xattr);
|
||||
|
||||
name += len + 1;
|
||||
bytes -= len + 1;
|
||||
} while (bytes > 0);
|
||||
}
|
||||
|
||||
free(buf);
|
||||
close(fd);
|
||||
|
||||
normal_xattrs:
|
||||
value_len = listxattr(path, NULL, 0);
|
||||
error_exit(value_len < 0, "hidden listxattr "ERRF, ERRA);
|
||||
if (value_len == 0)
|
||||
return;
|
||||
|
||||
buf = calloc(1, value_len);
|
||||
error_exit(!buf, "malloc value"ERRF, ERRA);
|
||||
|
||||
ret = listxattr(path, buf, value_len);
|
||||
error_exit(ret < 0, "hidden listxattr %d", ret);
|
||||
|
||||
name = buf;
|
||||
bytes = ret;
|
||||
do {
|
||||
len = strlen(name);
|
||||
|
||||
error_exit(len == 0, "listxattr_hidden empty name");
|
||||
error_exit(len > SCOUTFS_XATTR_MAX_NAME_LEN, "listxattr_hidden long name");
|
||||
|
||||
value_len = getxattr(path, name, NULL, 0);
|
||||
error_exit(value_len < 0, "value "ERRF, ERRA);
|
||||
|
||||
xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + len + value_len);
|
||||
error_exit(!xattr, "error allocating generated xattr");
|
||||
|
||||
*xattr = (struct scoutfs_parallel_restore_xattr) {
|
||||
.ino = ino,
|
||||
.pos = pos++,
|
||||
.name_len = len,
|
||||
.value_len = value_len,
|
||||
};
|
||||
xattr->name = (void *)(xattr + 1);
|
||||
xattr->value = (void *)(xattr->name + len);
|
||||
|
||||
ret = getxattr(path, name, (void *)(xattr->name + len), value_len);
|
||||
error_exit(ret != value_len, "fgetxattr value"ERRF, ERRA);
|
||||
|
||||
memcpy(xattr->name, name, len);
|
||||
|
||||
ret = scoutfs_parallel_restore_add_xattr(wri, xattr);
|
||||
error_exit(ret, "add xattr %d", ret);
|
||||
|
||||
free(xattr);
|
||||
|
||||
name += len + 1;
|
||||
bytes -= len + 1;
|
||||
} while (bytes > 0);
|
||||
|
||||
free(buf);
|
||||
}
|
||||
|
||||
/*
|
||||
* We can't store the same inode multiple times, so we need to make
|
||||
* sure to account for hardlinks. Maintain a LL that stores the first
|
||||
* hardlink inode we encounter, and every subsequent hardlink to this
|
||||
* inode will omit inserting an inode, and just adds another entry
|
||||
*/
|
||||
static bool is_new_inode_item(bool nlink, u64 ino)
|
||||
{
|
||||
struct hardlink_head *hh_tmp;
|
||||
struct hardlink_head *hh;
|
||||
|
||||
if (!nlink)
|
||||
return true;
|
||||
|
||||
/* lineair search, pretty awful, should be a binary tree */
|
||||
list_for_each_entry_safe(hh, hh_tmp, &hardlinks, head) {
|
||||
if (hh->ino == ino)
|
||||
return false;
|
||||
}
|
||||
|
||||
/* insert item */
|
||||
hh = malloc(sizeof(struct hardlink_head));
|
||||
error_exit(!hh, "malloc");
|
||||
hh->ino = ino;
|
||||
list_add_tail(&hh->head, &hardlinks);
|
||||
|
||||
/*
|
||||
* XXX
|
||||
*
|
||||
* We can be confident that if we don't traverse filesystems
|
||||
* that once we've created N entries of an N-linked inode, that
|
||||
* it can be removed from the LL. This would significantly
|
||||
* improve the manageability of the list.
|
||||
*
|
||||
* All we'd need to do is add a counter and compare it to the nr_links
|
||||
* field of the inode.
|
||||
*/
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* create the inode data for a given path as best as possible
|
||||
* duplicating the exact data from the source path
|
||||
*/
|
||||
static struct scoutfs_parallel_restore_inode *read_inode_data(char *path, u64 ino, bool *nlink, bool is_scoutfs)
|
||||
{
|
||||
struct scoutfs_parallel_restore_inode *inode = NULL;
|
||||
struct scoutfs_ioctl_stat_more stm;
|
||||
struct stat st;
|
||||
int ret;
|
||||
int fd;
|
||||
|
||||
inode = calloc(1, sizeof(struct scoutfs_parallel_restore_inode));
|
||||
error_exit(!inode, "failure allocating inode");
|
||||
|
||||
ret = lstat(path, &st);
|
||||
error_exit(ret, "failure stat inode");
|
||||
|
||||
/* use exact inode numbers from path, except for root ino */
|
||||
if (ino != SCOUTFS_ROOT_INO)
|
||||
inode->ino = st.st_ino;
|
||||
else
|
||||
inode->ino = SCOUTFS_ROOT_INO;
|
||||
|
||||
inode->mode = st.st_mode;
|
||||
inode->uid = st.st_uid;
|
||||
inode->gid = st.st_gid;
|
||||
inode->atime = st.st_atim;
|
||||
inode->ctime = st.st_ctim;
|
||||
inode->mtime = st.st_mtim;
|
||||
inode->size = st.st_size;
|
||||
|
||||
inode->rdev = st.st_rdev;
|
||||
|
||||
/* scoutfs specific */
|
||||
inode->meta_seq = 0;
|
||||
inode->data_seq = 0;
|
||||
inode->crtime = st.st_ctim;
|
||||
|
||||
if (S_ISREG(inode->mode)) {
|
||||
if (inode->size > 0)
|
||||
inode->offline = true;
|
||||
|
||||
if (is_scoutfs) {
|
||||
fd = open(path, O_RDONLY);
|
||||
error_exit(!fd, "open failure"ERRF, ERRA);
|
||||
|
||||
ret = ioctl(fd, SCOUTFS_IOC_STAT_MORE, &stm);
|
||||
error_exit(ret, "failure SCOUTFS_IOC_STAT_MORE inode");
|
||||
|
||||
inode->meta_seq = stm.meta_seq;
|
||||
inode->data_seq = stm.data_seq;
|
||||
inode->crtime = (struct timespec){.tv_sec = stm.crtime_sec, .tv_nsec = stm.crtime_nsec};
|
||||
|
||||
close(fd);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* pass whether item is hardlinked or not */
|
||||
*nlink = (st.st_nlink > 1);
|
||||
|
||||
return inode;
|
||||
}
|
||||
|
||||
struct writer_args {
|
||||
struct list_head head;
|
||||
|
||||
int dev_fd;
|
||||
int pair_fd;
|
||||
|
||||
struct scoutfs_parallel_restore_slice slice;
|
||||
};
|
||||
|
||||
static void restore_path(struct scoutfs_parallel_restore_writer *wri, struct writer_args *args, struct write_result *res, void *buf, char *path, u64 ino)
|
||||
{
|
||||
struct scoutfs_parallel_restore_inode *inode;
|
||||
struct scoutfs_parallel_restore_entry *entry;
|
||||
DIR *dirp = NULL;
|
||||
char *subdir = NULL;
|
||||
char link[PATH_MAX + 1];
|
||||
struct dirent *ent;
|
||||
struct statfs stf;
|
||||
int ret = 0;
|
||||
int subdir_count = 0, file_count = 0;
|
||||
size_t ent_len = 0;
|
||||
size_t pos = 0;
|
||||
bool nlink = false;
|
||||
char ind = '?';
|
||||
u64 mode;
|
||||
bool is_scoutfs = false;
|
||||
|
||||
/* get fs info once per path */
|
||||
ret = statfs(path, &stf);
|
||||
error_exit(ret != 0, "statfs"ERRF, ERRA);
|
||||
is_scoutfs = (stf.f_type == 0x554f4353);
|
||||
|
||||
if (!is_scoutfs && !warn_scoutfs) {
|
||||
warn_scoutfs = true;
|
||||
fprintf(stderr, "Non-scoutfs source path detected: scoutfs specific features disabled\n");
|
||||
}
|
||||
|
||||
/* traverse the entire tree */
|
||||
dirp = opendir(path);
|
||||
errno = 0;
|
||||
while ((ent = readdir(dirp))) {
|
||||
if (ent->d_type == DT_DIR) {
|
||||
if ((strcmp(ent->d_name, ".") == 0) ||
|
||||
(strcmp(ent->d_name, "..") == 0)) {
|
||||
/* position still matters */
|
||||
pos++;
|
||||
continue;
|
||||
}
|
||||
|
||||
/* recurse into subdir */
|
||||
ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
|
||||
error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
|
||||
restore_path(wri, args, res, buf, subdir, ent->d_ino);
|
||||
|
||||
subdir_count++;
|
||||
|
||||
ent_len += strlen(ent->d_name);
|
||||
|
||||
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name));
|
||||
error_exit(!entry, "error allocating generated entry");
|
||||
|
||||
*entry = (struct scoutfs_parallel_restore_entry) {
|
||||
.dir_ino = ino,
|
||||
.pos = pos++,
|
||||
.ino = ent->d_ino,
|
||||
.mode = DIR_MODE,
|
||||
.name = (void *)(entry + 1),
|
||||
.name_len = strlen(ent->d_name),
|
||||
};
|
||||
|
||||
memcpy(entry->name, ent->d_name, strlen(ent->d_name));
|
||||
ret = scoutfs_parallel_restore_add_entry(wri, entry);
|
||||
error_exit(ret, "add entry %d", ret);
|
||||
free(entry);
|
||||
|
||||
add_xattrs(wri, subdir, ent->d_ino, is_scoutfs);
|
||||
|
||||
free(subdir);
|
||||
|
||||
le64_add_cpu(&res->dirs_created, 1);
|
||||
} else if (ent->d_type == DT_REG) {
|
||||
|
||||
file_count++;
|
||||
|
||||
ent_len += strlen(ent->d_name);
|
||||
|
||||
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name));
|
||||
error_exit(!entry, "error allocating generated entry");
|
||||
|
||||
*entry = (struct scoutfs_parallel_restore_entry) {
|
||||
.dir_ino = ino,
|
||||
.pos = pos++,
|
||||
.ino = ent->d_ino,
|
||||
.mode = REG_MODE,
|
||||
.name = (void *)(entry + 1),
|
||||
.name_len = strlen(ent->d_name),
|
||||
};
|
||||
|
||||
memcpy(entry->name, ent->d_name, strlen(ent->d_name));
|
||||
ret = scoutfs_parallel_restore_add_entry(wri, entry);
|
||||
error_exit(ret, "add entry %d", ret);
|
||||
free(entry);
|
||||
|
||||
ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
|
||||
error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
|
||||
|
||||
/* file inode */
|
||||
inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs);
|
||||
fprintf(stdout, "f %s/%s\n", path, ent->d_name);
|
||||
if (is_new_inode_item(nlink, ent->d_ino)) {
|
||||
ret = scoutfs_parallel_restore_add_inode(wri, inode);
|
||||
error_exit(ret, "add reg file inode %d", ret);
|
||||
|
||||
/* xattrs */
|
||||
add_xattrs(wri, subdir, ent->d_ino, is_scoutfs);
|
||||
}
|
||||
free(inode);
|
||||
|
||||
free(subdir);
|
||||
|
||||
le64_add_cpu(&res->files_created, 1);
|
||||
} else if (ent->d_type == DT_LNK) {
|
||||
/* readlink */
|
||||
|
||||
ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
|
||||
error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
|
||||
|
||||
ent_len += strlen(ent->d_name);
|
||||
|
||||
ret = readlink(subdir, link, PATH_MAX);
|
||||
error_exit(ret < 0, "readlink %d", ret);
|
||||
/* must 0-terminate if we want to print it */
|
||||
link[ret] = 0;
|
||||
|
||||
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name));
|
||||
error_exit(!entry, "error allocating generated entry");
|
||||
|
||||
*entry = (struct scoutfs_parallel_restore_entry) {
|
||||
.dir_ino = ino,
|
||||
.pos = pos++,
|
||||
.ino = ent->d_ino,
|
||||
.mode = LNK_MODE,
|
||||
.name = (void *)(entry + 1),
|
||||
.name_len = strlen(ent->d_name),
|
||||
};
|
||||
|
||||
memcpy(entry->name, ent->d_name, strlen(ent->d_name));
|
||||
ret = scoutfs_parallel_restore_add_entry(wri, entry);
|
||||
error_exit(ret, "add symlink entry %d", ret);
|
||||
|
||||
/* link inode */
|
||||
inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs);
|
||||
|
||||
fprintf(stdout, "l %s/%s -> %s\n", path, ent->d_name, link);
|
||||
|
||||
inode->mode = LNK_MODE;
|
||||
inode->target = link;
|
||||
inode->target_len = strlen(link) + 1; /* scoutfs null terminates symlinks */
|
||||
|
||||
ret = scoutfs_parallel_restore_add_inode(wri, inode);
|
||||
error_exit(ret, "add syml inode %d", ret);
|
||||
|
||||
free(inode);
|
||||
free(subdir);
|
||||
|
||||
le64_add_cpu(&res->files_created, 1);
|
||||
} else {
|
||||
/* odd stuff */
|
||||
switch(ent->d_type) {
|
||||
case DT_CHR:
|
||||
ind = 'c';
|
||||
mode = S_IFCHR;
|
||||
break;
|
||||
case DT_BLK:
|
||||
ind = 'b';
|
||||
mode = S_IFBLK;
|
||||
break;
|
||||
case DT_FIFO:
|
||||
ind = 'p';
|
||||
mode = S_IFIFO;
|
||||
break;
|
||||
case DT_SOCK:
|
||||
ind = 's';
|
||||
mode = S_IFSOCK;
|
||||
break;
|
||||
default:
|
||||
error_exit(true, "Unknown readdir entry type");
|
||||
;;
|
||||
}
|
||||
|
||||
file_count++;
|
||||
|
||||
ent_len += strlen(ent->d_name);
|
||||
|
||||
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + strlen(ent->d_name));
|
||||
error_exit(!entry, "error allocating generated entry");
|
||||
|
||||
*entry = (struct scoutfs_parallel_restore_entry) {
|
||||
.dir_ino = ino,
|
||||
.pos = pos++,
|
||||
.ino = ent->d_ino,
|
||||
.mode = mode,
|
||||
.name = (void *)(entry + 1),
|
||||
.name_len = strlen(ent->d_name),
|
||||
};
|
||||
|
||||
memcpy(entry->name, ent->d_name, strlen(ent->d_name));
|
||||
ret = scoutfs_parallel_restore_add_entry(wri, entry);
|
||||
error_exit(ret, "add entry %d", ret);
|
||||
|
||||
free(entry);
|
||||
|
||||
ret = asprintf(&subdir, "%s/%s", path, ent->d_name);
|
||||
error_exit(ret == -1, "asprintf subdir"ERRF, ERRA);
|
||||
|
||||
/* file inode */
|
||||
inode = read_inode_data(subdir, ent->d_ino, &nlink, is_scoutfs);
|
||||
fprintf(stdout, "%c %lld %s/%s\n", ind, inode->ino, path, ent->d_name);
|
||||
if (is_new_inode_item(nlink, ent->d_ino)) {
|
||||
ret = scoutfs_parallel_restore_add_inode(wri, inode);
|
||||
error_exit(ret, "add reg file inode %d", ret);
|
||||
}
|
||||
free(inode);
|
||||
|
||||
free(subdir);
|
||||
|
||||
le64_add_cpu(&res->files_created, 1);
|
||||
}
|
||||
|
||||
/* batch out changes, will be about 1M */
|
||||
if (le64_to_cpu(res->files_created) > BATCH_FILES) {
|
||||
write_bufs_and_send(wri, buf, args->dev_fd, res, false, args->pair_fd);
|
||||
}
|
||||
|
||||
}
|
||||
if (ent != NULL)
|
||||
error_exit(errno, "readdir"ERRF, ERRA);
|
||||
closedir(dirp);
|
||||
|
||||
/* create the dir itself */
|
||||
inode = read_inode_data(path, ino, &nlink, is_scoutfs);
|
||||
inode->nr_subdirs = subdir_count;
|
||||
inode->total_entry_name_bytes = ent_len;
|
||||
fprintf(stdout, "d %s\n", path);
|
||||
|
||||
ret = scoutfs_parallel_restore_add_inode(wri, inode);
|
||||
error_exit(ret, "add dir inode %d", ret);
|
||||
|
||||
free(inode);
|
||||
|
||||
/* No need to send, we'll send final after last directory is complete */
|
||||
}
|
||||
|
||||
static int do_restore(struct opts *opts)
|
||||
{
|
||||
struct scoutfs_parallel_restore_writer *pwri, *wri = NULL;
|
||||
struct scoutfs_parallel_restore_slice *slices = NULL;
|
||||
struct scoutfs_super_block *super = NULL;
|
||||
struct writer_args *args;
|
||||
struct write_result res;
|
||||
int pair[2] = {-1, -1};
|
||||
LIST_HEAD(writers);
|
||||
void *buf = NULL;
|
||||
void *bufp = NULL;
|
||||
int dev_fd = -1;
|
||||
pid_t pid;
|
||||
int ret;
|
||||
u64 tot_bytes;
|
||||
u64 tot_dirs;
|
||||
u64 tot_files;
|
||||
|
||||
ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
|
||||
error_exit(ret, "socketpair error "ERRF, ERRA);
|
||||
|
||||
dev_fd = open(opts->meta_path, O_DIRECT | (O_RDWR|O_EXCL));
|
||||
error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA);
|
||||
|
||||
errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?:
|
||||
posix_memalign((void **)&buf, 4096, BUF_SIZ);
|
||||
error_exit(errno, "error allocating block bufs "ERRF, ERRA);
|
||||
|
||||
ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
|
||||
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
|
||||
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret);
|
||||
|
||||
error_exit((super->flags & SCOUTFS_FLAG_IS_META_BDEV) == 0, "super block is not meta dev");
|
||||
|
||||
ret = scoutfs_parallel_restore_create_writer(&wri);
|
||||
error_exit(ret, "create writer %d", ret);
|
||||
|
||||
ret = scoutfs_parallel_restore_import_super(wri, super, dev_fd);
|
||||
error_exit(ret, "import super %d", ret);
|
||||
|
||||
slices = calloc(2, sizeof(struct scoutfs_parallel_restore_slice));
|
||||
error_exit(!slices, "alloc slices");
|
||||
|
||||
scoutfs_parallel_restore_init_slices(wri, slices, 2);
|
||||
|
||||
ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]);
|
||||
error_exit(ret, "add slices[0] %d", ret);
|
||||
|
||||
args = calloc(1, sizeof(struct writer_args));
|
||||
error_exit(!args, "alloc writer args");
|
||||
|
||||
args->dev_fd = dev_fd;
|
||||
args->slice = slices[1];
|
||||
args->pair_fd = pair[1];
|
||||
list_add_tail(&args->head, &writers);
|
||||
|
||||
/* fork writer process */
|
||||
pid = fork();
|
||||
error_exit(pid == -1, "fork error");
|
||||
|
||||
if (pid == 0) {
|
||||
ret = prctl(PR_SET_PDEATHSIG, SIGHUP);
|
||||
error_exit(ret < 0, "failed to set parent death sig");
|
||||
|
||||
errno = posix_memalign((void **)&bufp, 4096, BUF_SIZ);
|
||||
error_exit(errno, "error allocating block bufp "ERRF, ERRA);
|
||||
|
||||
ret = scoutfs_parallel_restore_create_writer(&pwri);
|
||||
error_exit(ret, "create pwriter %d", ret);
|
||||
|
||||
ret = scoutfs_parallel_restore_add_slice(pwri, &args->slice);
|
||||
error_exit(ret, "add pslice %d", ret);
|
||||
|
||||
memset(&res, 0, sizeof(res));
|
||||
|
||||
restore_path(pwri, args, &res, bufp, opts->source_dir, SCOUTFS_ROOT_INO);
|
||||
|
||||
res.complete = true;
|
||||
|
||||
write_bufs_and_send(pwri, buf, args->dev_fd, &res, true, args->pair_fd);
|
||||
|
||||
scoutfs_parallel_restore_destroy_writer(&pwri);
|
||||
free(bufp);
|
||||
|
||||
exit(0);
|
||||
};
|
||||
|
||||
/* read results and wait for writer to finish */
|
||||
tot_bytes = 0;
|
||||
tot_dirs = 1;
|
||||
tot_files = 0;
|
||||
for (;;) {
|
||||
ret = read(pair[0], &res, sizeof(struct write_result));
|
||||
error_exit(ret != sizeof(struct write_result), "result read error %d", ret);
|
||||
|
||||
ret = scoutfs_parallel_restore_add_progress(wri, &res.prog);
|
||||
error_exit(ret, "add thr prog %d", ret);
|
||||
|
||||
if (res.slice.meta_len != 0) {
|
||||
ret = scoutfs_parallel_restore_add_slice(wri, &res.slice);
|
||||
error_exit(ret, "add thr slice %d", ret);
|
||||
|
||||
if (res.complete)
|
||||
break;
|
||||
}
|
||||
|
||||
tot_bytes += le64_to_cpu(res.bytes_written);
|
||||
tot_files += le64_to_cpu(res.files_created);
|
||||
tot_dirs += le64_to_cpu(res.dirs_created);
|
||||
}
|
||||
|
||||
tot_bytes += write_bufs(wri, buf, args->dev_fd);
|
||||
|
||||
fprintf(stdout, "Wrote %lld directories, %lld files, %lld bytes total\n",
|
||||
tot_dirs, tot_files, tot_bytes);
|
||||
|
||||
/* write super to finalize */
|
||||
ret = scoutfs_parallel_restore_export_super(wri, super);
|
||||
error_exit(ret, "update super %d", ret);
|
||||
|
||||
ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
|
||||
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
|
||||
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret);
|
||||
|
||||
scoutfs_parallel_restore_destroy_writer(&wri);
|
||||
|
||||
if (dev_fd >= 0)
|
||||
close(dev_fd);
|
||||
if (pair[0] > 0)
|
||||
close(pair[0]);
|
||||
if (pair[1] > 0)
|
||||
close(pair[1]);
|
||||
free(super);
|
||||
free(args);
|
||||
free(slices);
|
||||
free(buf);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct opts opts = (struct opts){ 0 };
|
||||
struct hardlink_head *hh_tmp;
|
||||
struct hardlink_head *hh;
|
||||
int ret;
|
||||
int c;
|
||||
|
||||
INIT_LIST_HEAD(&hardlinks);
|
||||
|
||||
while ((c = getopt(argc, argv, "b:m:s:")) != -1) {
|
||||
switch(c) {
|
||||
case 'm':
|
||||
opts.meta_path = strdup(optarg);
|
||||
break;
|
||||
case 's':
|
||||
opts.source_dir = strdup(optarg);
|
||||
break;
|
||||
case '?':
|
||||
printf("Unknown option '%c'\n", optopt);
|
||||
usage();
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
error_exit(!opts.meta_path, "must specify metadata device path with -m");
|
||||
error_exit(!opts.source_dir, "must specify source directory path with -s");
|
||||
|
||||
ret = do_restore(&opts);
|
||||
|
||||
free(opts.meta_path);
|
||||
free(opts.source_dir);
|
||||
|
||||
list_for_each_entry_safe(hh, hh_tmp, &hardlinks, head) {
|
||||
list_del_init(&hh->head);
|
||||
free(hh);
|
||||
}
|
||||
|
||||
return ret == 0 ? 0 : 1;
|
||||
}
|
||||
78
tests/tests/parallel_restore.sh
Normal file
78
tests/tests/parallel_restore.sh
Normal file
@@ -0,0 +1,78 @@
|
||||
#
|
||||
# validate parallel restore library
|
||||
#
|
||||
|
||||
t_require_commands scoutfs parallel_restore find xargs
|
||||
|
||||
SCR="$T_TMPDIR/mnt.scratch"
|
||||
mkdir -p "$SCR"
|
||||
|
||||
scratch_mkfs() {
|
||||
scoutfs mkfs $@ \
|
||||
-A -f -Q 0,127.0.0.1,53000 $T_EX_META_DEV $T_EX_DATA_DEV
|
||||
}
|
||||
|
||||
scratch_check() {
|
||||
# give ample time for writes to commit
|
||||
sleep 1
|
||||
sync
|
||||
scoutfs check -d ${T_TMPDIR}/check.debug $T_EX_META_DEV $T_EX_DATA_DEV
|
||||
}
|
||||
|
||||
scratch_mount() {
|
||||
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 $T_EX_DATA_DEV $SCR
|
||||
}
|
||||
|
||||
|
||||
echo "== simple mkfs/restore/mount"
|
||||
# meta device just big enough for reserves and the metadata we'll fill
|
||||
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
|
||||
parallel_restore -m "$T_EX_META_DEV" > /dev/null || t_fail "parallel_restore"
|
||||
scratch_check || t_fail "check failed"
|
||||
scratch_mount
|
||||
|
||||
scoutfs statfs -p "$SCR" | grep -v -e 'fsid' -e 'rid'
|
||||
find "$SCR" -exec scoutfs list-hidden-xattrs {} \; | wc
|
||||
scoutfs search-xattrs -p "$SCR" scoutfs.hide.srch.sam_vol_F01030L6 -p "$SCR" | wc
|
||||
find "$SCR" -type f -name "file-*" | head -n 4 | xargs -n 1 scoutfs get-fiemap -L
|
||||
scoutfs df -p "$SCR"
|
||||
scoutfs quota-list -p "$SCR"
|
||||
umount "$SCR"
|
||||
scratch_check || t_fail "check after mount failed"
|
||||
|
||||
echo "== just under ENOSPC"
|
||||
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
|
||||
parallel_restore -m "$T_EX_META_DEV" -n 3000000 > /dev/null || t_fail "parallel_restore"
|
||||
scratch_check || t_fail "check failed"
|
||||
scratch_mount
|
||||
scoutfs df -p "$SCR"
|
||||
umount "$SCR"
|
||||
scratch_check || t_fail "check after mount failed"
|
||||
|
||||
echo "== just over ENOSPC"
|
||||
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
|
||||
parallel_restore -m "$T_EX_META_DEV" -n 3500000 | grep died 2>&1 && t_fail "parallel_restore"
|
||||
scratch_check || t_fail "check failed"
|
||||
|
||||
echo "== ENOSPC"
|
||||
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
|
||||
parallel_restore -m "$T_EX_META_DEV" -d 600:1000 -f 600:1000 -n 4000000 | grep died 2>&1 && t_fail "parallel_restore"
|
||||
|
||||
echo "== attempt to restore data device"
|
||||
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
|
||||
parallel_restore -m "$T_EX_DATA_DEV" | grep died 2>&1 && t_fail "parallel_restore"
|
||||
|
||||
echo "== attempt format_v1 restore"
|
||||
scratch_mkfs -V 1 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
|
||||
parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore"
|
||||
|
||||
echo "== test if previously mounted"
|
||||
scratch_mkfs -V 2 -m 10G -d 60G > $T_TMP.mkfs.out 2>&1 || t_fail "mkfs failed"
|
||||
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
|
||||
"$T_EX_DATA_DEV" "$SCR"
|
||||
umount "$SCR"
|
||||
parallel_restore -m "$T_EX_META_DEV" | grep died 2>&1 && t_fail "parallel_restore"
|
||||
|
||||
echo "== cleanup"
|
||||
rmdir "$SCR"
|
||||
t_pass
|
||||
@@ -7,7 +7,7 @@ FMTIOC_H := format.h ioctl.h
|
||||
FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H))
|
||||
|
||||
CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
|
||||
-fno-strict-aliasing \
|
||||
-I src/ -fno-strict-aliasing \
|
||||
-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
|
||||
|
||||
ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),)
|
||||
@@ -15,10 +15,13 @@ CFLAGS += -I../kmod/src
|
||||
endif
|
||||
|
||||
BIN := src/scoutfs
|
||||
OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
|
||||
DEPS := $(wildcard */*.d)
|
||||
OBJ_DIRS := src src/check
|
||||
OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c)))
|
||||
DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d))
|
||||
|
||||
all: $(BIN)
|
||||
AR := src/scoutfs_parallel_restore.a
|
||||
|
||||
all: $(BIN) $(AR)
|
||||
|
||||
ifneq ($(DEPS),)
|
||||
-include $(DEPS)
|
||||
@@ -36,6 +39,10 @@ $(BIN): $(OBJ)
|
||||
$(QU) [BIN $@]
|
||||
$(VE)gcc -o $@ $^ -luuid -lm -lcrypto -lblkid
|
||||
|
||||
$(AR): $(OBJ)
|
||||
$(QU) [AR $@]
|
||||
$(VE)ar rcs $@ $^
|
||||
|
||||
%.o %.d: %.c Makefile sparse.sh
|
||||
$(QU) [CC $<]
|
||||
$(VE)gcc $(CFLAGS) -MD -MP -MF $*.d -c $< -o $*.o
|
||||
|
||||
@@ -76,6 +76,41 @@ run when the file system will not be mounted.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "check META-DEVICE DATA-DEVICE [-d|--debug FILE]"
|
||||
.sp
|
||||
Performs an offline file system check. The program iterates through all the
|
||||
data structures on disk directly - the filesystem must not be mounted while
|
||||
this operation is running.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.B "-d, --debug FILE"
|
||||
An output file where the program can output debug information about the
|
||||
state of the filesystem as it performs the check. If
|
||||
.B FILE
|
||||
is "-", the debug output is written to the Standard Error output.
|
||||
.TP
|
||||
.RE
|
||||
.sp
|
||||
.B RETURN VALUE
|
||||
The check function can return the following exit codes:
|
||||
.RS
|
||||
.TP
|
||||
\fB 0 \fR - no filesystem issues detected
|
||||
.TP
|
||||
\fB 1 \fR - file system issues were detected
|
||||
.TP
|
||||
\fB 8 \fR - operational error
|
||||
.TP
|
||||
\fB 16 \fR - usage error
|
||||
.TP
|
||||
\fB 32 \fR - cancelled by user (SIGINT)
|
||||
.TP
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "counters [-t|--table] SYSFS-DIR"
|
||||
.sp
|
||||
|
||||
@@ -54,6 +54,8 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/.
|
||||
install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
|
||||
install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
|
||||
install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
|
||||
install -m 644 -D src/parallel_restore.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/parallel_restore.h
|
||||
install -m 644 -D src/scoutfs_parallel_restore.a $RPM_BUILD_ROOT%{_libdir}/scoutfs/libscoutfs_parallel_restore.a
|
||||
install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
|
||||
install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service
|
||||
install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example
|
||||
@@ -70,6 +72,7 @@ install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdi
|
||||
%files -n scoutfs-devel
|
||||
%defattr(644,root,root,755)
|
||||
%{_includedir}/scoutfs
|
||||
%{_libdir}/scoutfs
|
||||
|
||||
%clean
|
||||
rm -rf %{buildroot}
|
||||
|
||||
@@ -10,6 +10,11 @@
|
||||
* Just a quick simple native bitmap.
|
||||
*/
|
||||
|
||||
int test_bit(unsigned long *bits, u64 nr)
|
||||
{
|
||||
return !!(bits[nr / BITS_PER_LONG] & (1UL << (nr & (BITS_PER_LONG - 1))));
|
||||
}
|
||||
|
||||
void set_bit(unsigned long *bits, u64 nr)
|
||||
{
|
||||
bits[nr / BITS_PER_LONG] |= 1UL << (nr & (BITS_PER_LONG - 1));
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef _BITMAP_H_
|
||||
#define _BITMAP_H_
|
||||
|
||||
int test_bit(unsigned long *bits, u64 nr);
|
||||
void set_bit(unsigned long *bits, u64 nr);
|
||||
void clear_bit(unsigned long *bits, u64 nr);
|
||||
u64 find_next_set_bit(unsigned long *start, u64 from, u64 total);
|
||||
|
||||
20
utils/src/bloom.c
Normal file
20
utils/src/bloom.c
Normal file
@@ -0,0 +1,20 @@
|
||||
#include <errno.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "hash.h"
|
||||
#include "bloom.h"
|
||||
|
||||
void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs)
|
||||
{
|
||||
u64 hash;
|
||||
int i;
|
||||
|
||||
hash = scoutfs_hash64(key, sizeof(struct scoutfs_key));
|
||||
|
||||
for (i = 0; i < SCOUTFS_FOREST_BLOOM_NRS; i++) {
|
||||
nrs[i] = (u32)hash % SCOUTFS_FOREST_BLOOM_BITS;
|
||||
hash >>= SCOUTFS_FOREST_BLOOM_FUNC_BITS;
|
||||
}
|
||||
}
|
||||
6
utils/src/bloom.h
Normal file
6
utils/src/bloom.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#ifndef _BLOOM_H_
|
||||
#define _BLOOM_H_
|
||||
|
||||
void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs);
|
||||
|
||||
#endif
|
||||
@@ -8,7 +8,7 @@
|
||||
#include "leaf_item_hash.h"
|
||||
#include "btree.h"
|
||||
|
||||
static void init_block(struct scoutfs_btree_block *bt, int level)
|
||||
void btree_init_block(struct scoutfs_btree_block *bt, int level)
|
||||
{
|
||||
int free;
|
||||
|
||||
@@ -33,7 +33,7 @@ void btree_init_root_single(struct scoutfs_btree_root *root,
|
||||
|
||||
memset(bt, 0, SCOUTFS_BLOCK_LG_SIZE);
|
||||
|
||||
init_block(bt, 0);
|
||||
btree_init_block(bt, 0);
|
||||
}
|
||||
|
||||
static void *alloc_val(struct scoutfs_btree_block *bt, int len)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef _BTREE_H_
|
||||
#define _BTREE_H_
|
||||
|
||||
void btree_init_block(struct scoutfs_btree_block *bt, int level);
|
||||
void btree_init_root_single(struct scoutfs_btree_root *root,
|
||||
struct scoutfs_btree_block *bt,
|
||||
u64 seq, u64 blkno);
|
||||
|
||||
166
utils/src/check/alloc.c
Normal file
166
utils/src/check/alloc.c
Normal file
@@ -0,0 +1,166 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <sys/mman.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "bitmap.h"
|
||||
#include "key.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "block.h"
|
||||
#include "btree.h"
|
||||
#include "extent.h"
|
||||
#include "iter.h"
|
||||
#include "sns.h"
|
||||
|
||||
/*
|
||||
* We check the list blocks serially.
|
||||
*
|
||||
* XXX:
|
||||
* - compare ref seqs
|
||||
* - detect cycles?
|
||||
*/
|
||||
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
struct scoutfs_alloc_list_block *lblk;
|
||||
struct scoutfs_block_ref ref;
|
||||
struct block *blk = NULL;
|
||||
u64 blkno;
|
||||
int ret;
|
||||
|
||||
ref = lhead->ref;
|
||||
|
||||
while (ref.blkno) {
|
||||
blkno = le64_to_cpu(ref.blkno);
|
||||
|
||||
ret = cb(blkno, 1, cb_arg);
|
||||
if (ret < 0) {
|
||||
ret = xlate_iter_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = block_get(&blk, blkno, 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
lblk = block_buf(blk);
|
||||
/* XXX verify block */
|
||||
ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/* XXX sort? maybe */
|
||||
|
||||
ref = lblk->next;
|
||||
|
||||
block_put(&blk);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
return btree_meta_iter(&root->root, cb, cb_arg);
|
||||
}
|
||||
|
||||
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
struct scoutfs_alloc_list_block *lblk;
|
||||
struct scoutfs_block_ref ref;
|
||||
struct block *blk = NULL;
|
||||
u64 blkno;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
ref = lhead->ref;
|
||||
|
||||
while (ref.blkno) {
|
||||
blkno = le64_to_cpu(ref.blkno);
|
||||
|
||||
ret = block_get(&blk, blkno, 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("alloc_list_block", blkno, 0);
|
||||
|
||||
lblk = block_buf(blk);
|
||||
/* XXX verify block */
|
||||
ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_ALLOC_LIST);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
/* XXX sort? maybe */
|
||||
|
||||
ret = 0;
|
||||
for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
|
||||
blkno = le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + i]);
|
||||
|
||||
ret = cb(blkno, 1, cb_arg);
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
|
||||
ref = lblk->next;
|
||||
|
||||
block_put(&blk);
|
||||
sns_pop();
|
||||
if (ret < 0) {
|
||||
ret = xlate_iter_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool valid_free_extent_key(struct scoutfs_key *key)
|
||||
{
|
||||
return (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE ||
|
||||
key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) &&
|
||||
(!key->_sk_fourth && !key->sk_type &&
|
||||
(key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE || !key->_sk_third));
|
||||
}
|
||||
|
||||
static int free_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
|
||||
{
|
||||
struct extent_cb_arg_t *ecba = cb_arg;
|
||||
u64 start;
|
||||
u64 len;
|
||||
|
||||
/* XXX not sure these eios are what we want */
|
||||
|
||||
if (val_len != 0)
|
||||
return -EIO;
|
||||
|
||||
if (!valid_free_extent_key(key))
|
||||
return -EIO;
|
||||
|
||||
if (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
|
||||
return -ECHECK_ITER_DONE;
|
||||
|
||||
start = le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1;
|
||||
len = le64_to_cpu(key->skfb_len);
|
||||
|
||||
return ecba->cb(start, len, ecba->cb_arg);
|
||||
}
|
||||
|
||||
/*
|
||||
* Call the callback with each of the primary BLKNO free extents stored
|
||||
* in item in the given alloc root. It doesn't visit the secondary
|
||||
* ORDER extents.
|
||||
*/
|
||||
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
struct extent_cb_arg_t ecba = { .cb = cb, .cb_arg = cb_arg };
|
||||
|
||||
return btree_item_iter(&root->root, free_item_cb, &ecba);
|
||||
}
|
||||
12
utils/src/check/alloc.h
Normal file
12
utils/src/check/alloc.h
Normal file
@@ -0,0 +1,12 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_ALLOC_H
|
||||
#define _SCOUTFS_UTILS_CHECK_ALLOC_H
|
||||
|
||||
#include "extent.h"
|
||||
|
||||
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
|
||||
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
|
||||
|
||||
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
|
||||
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
|
||||
|
||||
#endif
|
||||
613
utils/src/check/block.c
Normal file
613
utils/src/check/block.c
Normal file
@@ -0,0 +1,613 @@
|
||||
#define _ISOC11_SOURCE /* aligned_alloc */
|
||||
#define _DEFAULT_SOURCE /* syscall() */
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <linux/aio_abi.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "list.h"
|
||||
#include "cmp.h"
|
||||
#include "hash.h"
|
||||
|
||||
#include "block.h"
|
||||
#include "debug.h"
|
||||
#include "super.h"
|
||||
#include "eno.h"
|
||||
#include "crc.h"
|
||||
#include "sns.h"
|
||||
|
||||
static struct block_data {
|
||||
struct list_head *hash_lists;
|
||||
size_t hash_nr;
|
||||
|
||||
struct list_head active_head;
|
||||
struct list_head inactive_head;
|
||||
struct list_head dirty_list;
|
||||
size_t nr_active;
|
||||
size_t nr_inactive;
|
||||
size_t nr_dirty;
|
||||
|
||||
int meta_fd;
|
||||
size_t max_cached;
|
||||
size_t nr_events;
|
||||
|
||||
aio_context_t ctx;
|
||||
struct iocb *iocbs;
|
||||
struct iocb **iocbps;
|
||||
struct io_event *events;
|
||||
} global_bdat;
|
||||
|
||||
struct block {
|
||||
struct list_head hash_head;
|
||||
struct list_head lru_head;
|
||||
struct list_head dirty_head;
|
||||
struct list_head submit_head;
|
||||
unsigned long refcount;
|
||||
unsigned long uptodate:1,
|
||||
active:1;
|
||||
u64 blkno;
|
||||
void *buf;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
#define BLK_FMT \
|
||||
"blkno %llu rc %ld d %u a %u"
|
||||
#define BLK_ARG(blk) \
|
||||
(blk)->blkno, (blk)->refcount, !list_empty(&(blk)->dirty_head), blk->active
|
||||
#define debug_blk(blk, fmt, args...) \
|
||||
debug(fmt " " BLK_FMT, ##args, BLK_ARG(blk))
|
||||
|
||||
/*
|
||||
* This just allocates and initialzies the block. The caller is
|
||||
* responsible for putting it on the appropriate initial lists and
|
||||
* managing refcounts.
|
||||
*/
|
||||
static struct block *alloc_block(struct block_data *bdat, u64 blkno, size_t size)
|
||||
{
|
||||
struct block *blk;
|
||||
|
||||
blk = calloc(1, sizeof(struct block));
|
||||
if (blk) {
|
||||
blk->buf = aligned_alloc(4096, size); /* XXX static alignment :/ */
|
||||
if (!blk->buf) {
|
||||
free(blk);
|
||||
blk = NULL;
|
||||
} else {
|
||||
INIT_LIST_HEAD(&blk->hash_head);
|
||||
INIT_LIST_HEAD(&blk->lru_head);
|
||||
INIT_LIST_HEAD(&blk->dirty_head);
|
||||
INIT_LIST_HEAD(&blk->submit_head);
|
||||
blk->blkno = blkno;
|
||||
blk->size = size;
|
||||
}
|
||||
}
|
||||
|
||||
return blk;
|
||||
}
|
||||
|
||||
static void free_block(struct block_data *bdat, struct block *blk)
|
||||
{
|
||||
debug_blk(blk, "free");
|
||||
|
||||
if (!list_empty(&blk->lru_head)) {
|
||||
if (blk->active)
|
||||
bdat->nr_active--;
|
||||
else
|
||||
bdat->nr_inactive--;
|
||||
list_del(&blk->lru_head);
|
||||
}
|
||||
|
||||
if (!list_empty(&blk->dirty_head)) {
|
||||
bdat->nr_dirty--;
|
||||
list_del(&blk->dirty_head);
|
||||
}
|
||||
|
||||
if (!list_empty(&blk->hash_head))
|
||||
list_del(&blk->hash_head);
|
||||
|
||||
if (!list_empty(&blk->submit_head))
|
||||
list_del(&blk->submit_head);
|
||||
|
||||
free(blk->buf);
|
||||
free(blk);
|
||||
}
|
||||
|
||||
static bool blk_is_dirty(struct block *blk)
|
||||
{
|
||||
return !list_empty(&blk->dirty_head);
|
||||
}
|
||||
|
||||
/*
|
||||
* Rebalance the cache.
|
||||
*
|
||||
* First we shrink the cache to limit it to max_cached blocks.
|
||||
* Logically, we walk from oldest to newest in the inactive list and
|
||||
* then in the active list. Since these lists are physically one
|
||||
* list_head list we achieve this with a reverse walk starting from the
|
||||
* active head.
|
||||
*
|
||||
* Then we rebalnace the size of the two lists. The constraint is that
|
||||
* we don't let the active list grow larger than the inactive list. We
|
||||
* move blocks from the oldest tail of the active list to the newest
|
||||
* head of the inactive list.
|
||||
*
|
||||
* <- [active head] <-> [ .. active list .. ] <-> [inactive head] <-> [ .. inactive list .. ] ->
|
||||
*/
|
||||
static void rebalance_cache(struct block_data *bdat)
|
||||
{
|
||||
struct block *blk;
|
||||
struct block *blk_;
|
||||
|
||||
list_for_each_entry_safe_reverse(blk, blk_, &bdat->active_head, lru_head) {
|
||||
if ((bdat->nr_active + bdat->nr_inactive) < bdat->max_cached)
|
||||
break;
|
||||
|
||||
if (&blk->lru_head == &bdat->inactive_head || blk->refcount > 0 ||
|
||||
blk_is_dirty(blk))
|
||||
continue;
|
||||
|
||||
free_block(bdat, blk);
|
||||
}
|
||||
|
||||
list_for_each_entry_safe_reverse(blk, blk_, &bdat->inactive_head, lru_head) {
|
||||
if (bdat->nr_active <= bdat->nr_inactive || &blk->lru_head == &bdat->active_head)
|
||||
break;
|
||||
|
||||
list_move(&blk->lru_head, &bdat->inactive_head);
|
||||
blk->active = 0;
|
||||
bdat->nr_active--;
|
||||
bdat->nr_inactive++;
|
||||
}
|
||||
}
|
||||
|
||||
static void make_active(struct block_data *bdat, struct block *blk)
|
||||
{
|
||||
if (!blk->active) {
|
||||
if (!list_empty(&blk->lru_head)) {
|
||||
list_move(&blk->lru_head, &bdat->active_head);
|
||||
bdat->nr_inactive--;
|
||||
} else {
|
||||
list_add(&blk->lru_head, &bdat->active_head);
|
||||
}
|
||||
|
||||
blk->active = 1;
|
||||
bdat->nr_active++;
|
||||
}
|
||||
}
|
||||
|
||||
static int compar_iocbp(const void *A, const void *B)
|
||||
{
|
||||
struct iocb *a = *(struct iocb **)A;
|
||||
struct iocb *b = *(struct iocb **)B;
|
||||
|
||||
return scoutfs_cmp(a->aio_offset, b->aio_offset);
|
||||
}
|
||||
|
||||
static int submit_and_wait(struct block_data *bdat, struct list_head *list)
|
||||
{
|
||||
struct io_event *event;
|
||||
struct iocb *iocb;
|
||||
struct block *blk;
|
||||
int ret;
|
||||
int err;
|
||||
int nr;
|
||||
int i;
|
||||
|
||||
err = 0;
|
||||
nr = 0;
|
||||
list_for_each_entry(blk, list, submit_head) {
|
||||
iocb = &bdat->iocbs[nr];
|
||||
bdat->iocbps[nr] = iocb;
|
||||
|
||||
memset(iocb, 0, sizeof(struct iocb));
|
||||
|
||||
iocb->aio_data = (intptr_t)blk;
|
||||
iocb->aio_lio_opcode = blk_is_dirty(blk) ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD;
|
||||
iocb->aio_fildes = bdat->meta_fd;
|
||||
iocb->aio_buf = (intptr_t)blk->buf;
|
||||
iocb->aio_nbytes = blk->size;
|
||||
iocb->aio_offset = blk->blkno * blk->size;
|
||||
|
||||
nr++;
|
||||
|
||||
debug_blk(blk, "submit");
|
||||
|
||||
if ((nr < bdat->nr_events) && blk->submit_head.next != list)
|
||||
continue;
|
||||
|
||||
qsort(bdat->iocbps, nr, sizeof(bdat->iocbps[0]), compar_iocbp);
|
||||
|
||||
ret = syscall(__NR_io_submit, bdat->ctx, nr, bdat->iocbps);
|
||||
if (ret != nr) {
|
||||
if (ret >= 0)
|
||||
errno = EIO;
|
||||
ret = -errno;
|
||||
fprintf(stderr, "fatal system error submitting async IO: "ENO_FMT"\n",
|
||||
ENO_ARG(-ret));
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = syscall(__NR_io_getevents, bdat->ctx, nr, nr, bdat->events, NULL);
|
||||
if (ret != nr) {
|
||||
if (ret >= 0)
|
||||
errno = EIO;
|
||||
ret = -errno;
|
||||
fprintf(stderr, "fatal system error getting IO events: "ENO_FMT"\n",
|
||||
ENO_ARG(-ret));
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
for (i = 0; i < nr; i++) {
|
||||
event = &bdat->events[i];
|
||||
iocb = (struct iocb *)(intptr_t)event->obj;
|
||||
blk = (struct block *)(intptr_t)event->data;
|
||||
|
||||
debug_blk(blk, "complete res %lld", (long long)event->res);
|
||||
|
||||
if (event->res >= 0 && event->res != blk->size)
|
||||
event->res = -EIO;
|
||||
|
||||
/* io errors are fatal */
|
||||
if (event->res < 0) {
|
||||
ret = event->res;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (iocb->aio_lio_opcode == IOCB_CMD_PREAD) {
|
||||
blk->uptodate = 1;
|
||||
} else {
|
||||
list_del_init(&blk->dirty_head);
|
||||
bdat->nr_dirty--;
|
||||
}
|
||||
}
|
||||
nr = 0;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret ?: err;
|
||||
}
|
||||
|
||||
static void inc_refcount(struct block *blk)
|
||||
{
|
||||
blk->refcount++;
|
||||
}
|
||||
|
||||
void block_put(struct block **blkp)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
struct block *blk = *blkp;
|
||||
|
||||
if (blk) {
|
||||
blk->refcount--;
|
||||
*blkp = NULL;
|
||||
|
||||
rebalance_cache(bdat);
|
||||
}
|
||||
}
|
||||
|
||||
static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno)
|
||||
{
|
||||
u32 hash = scoutfs_hash32(&blkno, sizeof(blkno));
|
||||
|
||||
return &bdat->hash_lists[hash % bdat->hash_nr];
|
||||
}
|
||||
|
||||
int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic)
|
||||
{
|
||||
struct scoutfs_block_header *hdr;
|
||||
size_t size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE;
|
||||
int ret;
|
||||
u32 crc;
|
||||
|
||||
ret = block_get(&blk, blkno, bf);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "error reading block %llu\n", blkno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
hdr = block_buf(blk);
|
||||
|
||||
crc = crc_block(hdr, size);
|
||||
|
||||
/*
|
||||
* a bad CRC is easy to repair, so we pass a different error code
|
||||
* back. Unless the other data is also wrong - then it's EINVAL
|
||||
* to signal that this isn't a valid block hdr at all.
|
||||
*/
|
||||
if (le32_to_cpu(hdr->crc) != crc)
|
||||
ret = -EIO; /* keep checking other fields */
|
||||
|
||||
if (le32_to_cpu(hdr->magic) != magic)
|
||||
ret = -EINVAL;
|
||||
|
||||
/*
|
||||
* Our first caller fills in global_super. Until this completes,
|
||||
* we can't do this check.
|
||||
*/
|
||||
if ((blkno != SCOUTFS_SUPER_BLKNO) &&
|
||||
(hdr->fsid != global_super->hdr.fsid))
|
||||
ret = -EINVAL;
|
||||
|
||||
block_put(&blk);
|
||||
|
||||
debug("%s blk_hdr_valid blkno %llu size %lu crc 0x%08x magic 0x%08x ret %d",
|
||||
sns_str(), blkno, size, le32_to_cpu(hdr->crc), le32_to_cpu(hdr->magic),
|
||||
ret);
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf)
|
||||
{
|
||||
struct list_head *bucket = hash_bucket(bdat, blkno);
|
||||
struct block *search;
|
||||
struct block *blk;
|
||||
size_t size;
|
||||
|
||||
size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE;
|
||||
|
||||
blk = NULL;
|
||||
list_for_each_entry(search, bucket, hash_head) {
|
||||
if (search->blkno == blkno && search->size == size) {
|
||||
blk = search;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!blk) {
|
||||
blk = alloc_block(bdat, blkno, size);
|
||||
if (blk) {
|
||||
list_add(&blk->hash_head, bucket);
|
||||
list_add(&blk->lru_head, &bdat->inactive_head);
|
||||
bdat->nr_inactive++;
|
||||
}
|
||||
}
|
||||
if (blk)
|
||||
inc_refcount(blk);
|
||||
|
||||
return blk;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get a block.
|
||||
*
|
||||
* The caller holds a refcount to the block while it's in use that
|
||||
* prevents it from being removed from the cache. It must be dropped
|
||||
* with block_put();
|
||||
*/
|
||||
int block_get(struct block **blk_ret, u64 blkno, int bf)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
struct block *blk;
|
||||
LIST_HEAD(list);
|
||||
int ret;
|
||||
|
||||
blk = get_or_alloc(bdat, blkno, bf);
|
||||
if (!blk) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((bf & BF_ZERO)) {
|
||||
memset(blk->buf, 0, blk->size);
|
||||
blk->uptodate = 1;
|
||||
}
|
||||
|
||||
if (bf & BF_OVERWRITE)
|
||||
blk->uptodate = 1;
|
||||
|
||||
if (!blk->uptodate) {
|
||||
list_add(&blk->submit_head, &list);
|
||||
ret = submit_and_wait(bdat, &list);
|
||||
list_del_init(&blk->submit_head);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if ((bf & BF_DIRTY) && !blk_is_dirty(blk)) {
|
||||
list_add_tail(&bdat->dirty_list, &blk->dirty_head);
|
||||
bdat->nr_dirty++;
|
||||
}
|
||||
|
||||
make_active(bdat, blk);
|
||||
|
||||
rebalance_cache(bdat);
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret < 0)
|
||||
block_put(&blk);
|
||||
*blk_ret = blk;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void *block_buf(struct block *blk)
|
||||
{
|
||||
return blk->buf;
|
||||
}
|
||||
|
||||
size_t block_size(struct block *blk)
|
||||
{
|
||||
return blk->size;
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop the block from the cache, regardless of if it was free or not.
|
||||
* This is used to avoid writing blocks which were dirtied but then
|
||||
* later freed.
|
||||
*
|
||||
* The block is immediately freed and can't be referenced after this
|
||||
* returns.
|
||||
*/
|
||||
void block_drop(struct block **blkp)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
|
||||
free_block(bdat, *blkp);
|
||||
*blkp = NULL;
|
||||
rebalance_cache(bdat);
|
||||
}
|
||||
|
||||
/*
|
||||
* This doesn't quite work for mixing large and small blocks, but that's
|
||||
* fine, we never do that.
|
||||
*/
|
||||
static int compar_u64(const void *A, const void *B)
|
||||
{
|
||||
u64 a = *((u64 *)A);
|
||||
u64 b = *((u64 *)B);
|
||||
|
||||
return scoutfs_cmp(a, b);
|
||||
}
|
||||
|
||||
/*
|
||||
* This read-ahead is synchronous and errors are ignored. If any of the
|
||||
* blknos aren't present in the cache then we issue concurrent reads for
|
||||
* them and wait. Any existing cached blocks will be left as is.
|
||||
*
|
||||
* We might be trying to read a lot more than the number of events so we
|
||||
* sort the caller's blknos before iterating over them rather than
|
||||
* relying on submission sorting the blocks in each submitted set.
|
||||
*/
|
||||
void block_readahead(u64 *blknos, size_t nr)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
struct block *blk;
|
||||
struct block *blk_;
|
||||
LIST_HEAD(list);
|
||||
size_t i;
|
||||
|
||||
if (nr == 0)
|
||||
return;
|
||||
|
||||
qsort(blknos, nr, sizeof(blknos[0]), compar_u64);
|
||||
|
||||
for (i = 0; i < nr; i++) {
|
||||
blk = get_or_alloc(bdat, blknos[i], 0);
|
||||
if (blk) {
|
||||
if (!blk->uptodate)
|
||||
list_add_tail(&blk->submit_head, &list);
|
||||
else
|
||||
block_put(&blk);
|
||||
}
|
||||
}
|
||||
|
||||
(void)submit_and_wait(bdat, &list);
|
||||
|
||||
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
|
||||
list_del_init(&blk->submit_head);
|
||||
block_put(&blk);
|
||||
}
|
||||
|
||||
rebalance_cache(bdat);
|
||||
}
|
||||
|
||||
/*
|
||||
* The caller's block changes form a consistent transaction. If the amount of dirty
|
||||
* blocks is large enough we issue a write.
|
||||
*/
|
||||
int block_try_commit(bool force)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
struct block *blk;
|
||||
struct block *blk_;
|
||||
LIST_HEAD(list);
|
||||
int ret;
|
||||
|
||||
if (!force && bdat->nr_dirty < bdat->nr_events)
|
||||
return 0;
|
||||
|
||||
list_for_each_entry(blk, &bdat->dirty_list, dirty_head) {
|
||||
list_add_tail(&blk->submit_head, &list);
|
||||
inc_refcount(blk);
|
||||
}
|
||||
|
||||
ret = submit_and_wait(bdat, &list);
|
||||
|
||||
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
|
||||
list_del_init(&blk->submit_head);
|
||||
block_put(&blk);
|
||||
}
|
||||
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "error writing dirty transaction blocks\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_OVERWRITE | BF_DIRTY);
|
||||
if (ret == 0) {
|
||||
list_add(&blk->submit_head, &list);
|
||||
ret = submit_and_wait(bdat, &list);
|
||||
list_del_init(&blk->submit_head);
|
||||
block_put(&blk);
|
||||
} else {
|
||||
ret = -ENOMEM;
|
||||
}
|
||||
if (ret < 0)
|
||||
fprintf(stderr, "error writing super block to commit transaction\n");
|
||||
|
||||
out:
|
||||
rebalance_cache(bdat);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
size_t i;
|
||||
int ret;
|
||||
|
||||
bdat->max_cached = DIV_ROUND_UP(max_cached_bytes, SCOUTFS_BLOCK_LG_SIZE);
|
||||
bdat->hash_nr = bdat->max_cached / 4;
|
||||
bdat->nr_events = DIV_ROUND_UP(max_dirty_bytes, SCOUTFS_BLOCK_LG_SIZE);
|
||||
|
||||
bdat->iocbs = calloc(bdat->nr_events, sizeof(bdat->iocbs[0]));
|
||||
bdat->iocbps = calloc(bdat->nr_events, sizeof(bdat->iocbps[0]));
|
||||
bdat->events = calloc(bdat->nr_events, sizeof(bdat->events[0]));
|
||||
bdat->hash_lists = calloc(bdat->hash_nr, sizeof(bdat->hash_lists[0]));
|
||||
if (!bdat->iocbs || !bdat->iocbps || !bdat->events || !bdat->hash_lists) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&bdat->active_head);
|
||||
INIT_LIST_HEAD(&bdat->inactive_head);
|
||||
INIT_LIST_HEAD(&bdat->dirty_list);
|
||||
bdat->meta_fd = meta_fd;
|
||||
list_add(&bdat->inactive_head, &bdat->active_head);
|
||||
|
||||
for (i = 0; i < bdat->hash_nr; i++)
|
||||
INIT_LIST_HEAD(&bdat->hash_lists[i]);
|
||||
|
||||
ret = syscall(__NR_io_setup, bdat->nr_events, &bdat->ctx);
|
||||
|
||||
out:
|
||||
if (ret < 0) {
|
||||
free(bdat->iocbs);
|
||||
free(bdat->iocbps);
|
||||
free(bdat->events);
|
||||
free(bdat->hash_lists);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void block_shutdown(void)
|
||||
{
|
||||
struct block_data *bdat = &global_bdat;
|
||||
|
||||
syscall(SYS_io_destroy, bdat->ctx);
|
||||
|
||||
free(bdat->iocbs);
|
||||
free(bdat->iocbps);
|
||||
free(bdat->events);
|
||||
free(bdat->hash_lists);
|
||||
}
|
||||
34
utils/src/check/block.h
Normal file
34
utils/src/check/block.h
Normal file
@@ -0,0 +1,34 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_BLOCK_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_BLOCK_H_
|
||||
|
||||
#include <unistd.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
struct block;
|
||||
|
||||
#include "sparse.h"
|
||||
|
||||
/* block flags passed to block_get() */
|
||||
enum {
|
||||
BF_ZERO = (1 << 0), /* zero contents buf as block is returned */
|
||||
BF_DIRTY = (1 << 1), /* block will be written with transaction */
|
||||
BF_SM = (1 << 2), /* small 4k block instead of large 64k block */
|
||||
BF_OVERWRITE = (1 << 3), /* caller will overwrite contents, don't read */
|
||||
};
|
||||
|
||||
int block_get(struct block **blk_ret, u64 blkno, int bf);
|
||||
void block_put(struct block **blkp);
|
||||
|
||||
void *block_buf(struct block *blk);
|
||||
size_t block_size(struct block *blk);
|
||||
void block_drop(struct block **blkp);
|
||||
|
||||
void block_readahead(u64 *blknos, size_t nr);
|
||||
int block_try_commit(bool force);
|
||||
|
||||
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes);
|
||||
void block_shutdown(void);
|
||||
|
||||
int block_hdr_valid(struct block *blk, u64 blkno, int bf, u32 magic);
|
||||
|
||||
#endif
|
||||
217
utils/src/check/btree.c
Normal file
217
utils/src/check/btree.c
Normal file
@@ -0,0 +1,217 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "key.h"
|
||||
#include "avl.h"
|
||||
|
||||
#include "block.h"
|
||||
#include "btree.h"
|
||||
#include "extent.h"
|
||||
#include "iter.h"
|
||||
#include "sns.h"
|
||||
#include "meta.h"
|
||||
#include "problem.h"
|
||||
|
||||
static inline void *item_val(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item)
|
||||
{
|
||||
return (void *)bt + le16_to_cpu(item->val_off);
|
||||
}
|
||||
|
||||
static void readahead_refs(struct scoutfs_btree_block *bt)
|
||||
{
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_avl_node *node;
|
||||
struct scoutfs_block_ref *ref;
|
||||
u64 *blknos;
|
||||
u64 blkno;
|
||||
u16 valid = 0;
|
||||
u16 nr = le16_to_cpu(bt->nr_items);
|
||||
int i;
|
||||
|
||||
blknos = calloc(nr, sizeof(blknos[0]));
|
||||
if (!blknos)
|
||||
return;
|
||||
|
||||
node = avl_first(&bt->item_root);
|
||||
|
||||
for (i = 0; i < nr; i++) {
|
||||
item = container_of(node, struct scoutfs_btree_item, node);
|
||||
ref = item_val(bt, item);
|
||||
blkno = le64_to_cpu(ref->blkno);
|
||||
|
||||
if (valid_meta_blkno(blkno))
|
||||
blknos[valid++] = blkno;
|
||||
|
||||
node = avl_next(&bt->item_root, &item->node);
|
||||
}
|
||||
|
||||
if (valid > 0)
|
||||
block_readahead(blknos, valid);
|
||||
free(blknos);
|
||||
}
|
||||
|
||||
/*
|
||||
* Call the callback on the referenced block. Then if the block
|
||||
* contains referneces read it and recurse into all its references.
|
||||
*/
|
||||
static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, extent_cb_t cb,
|
||||
void *cb_arg)
|
||||
{
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_avl_node *node;
|
||||
struct block *blk = NULL;
|
||||
u64 blkno;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
blkno = le64_to_cpu(ref->blkno);
|
||||
if (!blkno)
|
||||
return 0;
|
||||
|
||||
ret = cb(blkno, 1, cb_arg);
|
||||
if (ret < 0) {
|
||||
ret = xlate_iter_errno(ret);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (level == 0)
|
||||
return 0;
|
||||
|
||||
ret = block_get(&blk, blkno, 0);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
sns_push("btree_parent", blkno, 0);
|
||||
|
||||
bt = block_buf(blk);
|
||||
|
||||
/* XXX integrate verification with block cache */
|
||||
if (bt->level != level) {
|
||||
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* read-ahead last level of parents */
|
||||
if (level == 2)
|
||||
readahead_refs(bt);
|
||||
|
||||
node = avl_first(&bt->item_root);
|
||||
|
||||
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
|
||||
item = container_of(node, struct scoutfs_btree_item, node);
|
||||
ref = item_val(bt, item);
|
||||
|
||||
ret = btree_ref_meta_iter(ref, level - 1, cb, cb_arg);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
node = avl_next(&bt->item_root, &item->node);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
block_put(&blk);
|
||||
sns_pop();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
/* XXX check root */
|
||||
if (root->height == 0)
|
||||
return 0;
|
||||
|
||||
return btree_ref_meta_iter(&root->ref, root->height - 1, cb, cb_arg);
|
||||
}
|
||||
|
||||
static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level,
|
||||
btree_item_cb_t cb, void *cb_arg)
|
||||
{
|
||||
struct scoutfs_btree_item *item;
|
||||
struct scoutfs_btree_block *bt;
|
||||
struct scoutfs_avl_node *node;
|
||||
struct block *blk = NULL;
|
||||
u64 blkno;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
blkno = le64_to_cpu(ref->blkno);
|
||||
if (!blkno)
|
||||
return 0;
|
||||
|
||||
ret = block_get(&blk, blkno, 0);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (level)
|
||||
sns_push("btree_parent", blkno, 0);
|
||||
else
|
||||
sns_push("btree_leaf", blkno, 0);
|
||||
|
||||
ret = block_hdr_valid(blk, blkno, 0, SCOUTFS_BLOCK_MAGIC_BTREE);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
bt = block_buf(blk);
|
||||
|
||||
/* XXX integrate verification with block cache */
|
||||
if (bt->level != level) {
|
||||
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* read-ahead leaves that contain items */
|
||||
if (level == 1)
|
||||
readahead_refs(bt);
|
||||
|
||||
node = avl_first(&bt->item_root);
|
||||
|
||||
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
|
||||
item = container_of(node, struct scoutfs_btree_item, node);
|
||||
|
||||
if (level) {
|
||||
ref = item_val(bt, item);
|
||||
ret = btree_ref_item_iter(ref, level - 1, cb, cb_arg);
|
||||
} else {
|
||||
ret = cb(&item->key, item_val(bt, item),
|
||||
le16_to_cpu(item->val_len), cb_arg);
|
||||
debug("free item key "SK_FMT" ret %d", SK_ARG(&item->key), ret);
|
||||
}
|
||||
if (ret < 0) {
|
||||
ret = xlate_iter_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
|
||||
node = avl_next(&bt->item_root, &item->node);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
block_put(&blk);
|
||||
sns_pop();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg)
|
||||
{
|
||||
/* XXX check root */
|
||||
if (root->height == 0)
|
||||
return 0;
|
||||
|
||||
return btree_ref_item_iter(&root->ref, root->height - 1, cb, cb_arg);
|
||||
}
|
||||
14
utils/src/check/btree.h
Normal file
14
utils/src/check/btree.h
Normal file
@@ -0,0 +1,14 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_BTREE_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_BTREE_H_
|
||||
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
|
||||
#include "extent.h"
|
||||
|
||||
typedef int (*btree_item_cb_t)(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg);
|
||||
|
||||
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg);
|
||||
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg);
|
||||
|
||||
#endif
|
||||
184
utils/src/check/check.c
Normal file
184
utils/src/check/check.c
Normal file
@@ -0,0 +1,184 @@
|
||||
#define _GNU_SOURCE /* O_DIRECT */
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include <stdbool.h>
|
||||
#include <argp.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "parse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "ioctl.h"
|
||||
#include "cmd.h"
|
||||
#include "dev.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "block.h"
|
||||
#include "debug.h"
|
||||
#include "meta.h"
|
||||
#include "super.h"
|
||||
#include "problem.h"
|
||||
|
||||
struct check_args {
|
||||
char *meta_device;
|
||||
char *data_device;
|
||||
char *debug_path;
|
||||
};
|
||||
|
||||
static int do_check(struct check_args *args)
|
||||
{
|
||||
int debug_fd = -1;
|
||||
int meta_fd = -1;
|
||||
int data_fd = -1;
|
||||
int ret;
|
||||
|
||||
if (args->debug_path) {
|
||||
if (strcmp(args->debug_path, "-") == 0)
|
||||
debug_fd = dup(STDERR_FILENO);
|
||||
else
|
||||
debug_fd = open(args->debug_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
|
||||
if (debug_fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "error opening debug output file '%s': %s (%d)\n",
|
||||
args->debug_path, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
debug_enable(debug_fd);
|
||||
}
|
||||
|
||||
meta_fd = open(args->meta_device, O_DIRECT | O_RDWR | O_EXCL);
|
||||
if (meta_fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to open meta device '%s': %s (%d)\n",
|
||||
args->meta_device, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
data_fd = open(args->data_device, O_DIRECT | O_RDWR | O_EXCL);
|
||||
if (data_fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to open data device '%s': %s (%d)\n",
|
||||
args->data_device, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* At some point we may convert this to a multi-pass system where we may
|
||||
* try and repair items, and, as long as repairs are made, we will rerun
|
||||
* the checks more times. We may need to start counting how many problems we
|
||||
* fix in the process of these loops, so that we don't stall on unrepairable
|
||||
* problems and are making actual repair progress. IOW - when we do a full
|
||||
* check loop without any problems fixed, we stop trying.
|
||||
*/
|
||||
ret = check_supers(data_fd) ?:
|
||||
check_super_in_use(meta_fd) ?:
|
||||
check_meta_alloc() ?:
|
||||
check_super_crc();
|
||||
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
debug("problem count %lu", problems_count());
|
||||
if (problems_count() > 0)
|
||||
printf("Problems detected.\n");
|
||||
|
||||
out:
|
||||
/* and tear it all down */
|
||||
block_shutdown();
|
||||
super_shutdown();
|
||||
debug_disable();
|
||||
|
||||
if (meta_fd >= 0)
|
||||
close(meta_fd);
|
||||
if (data_fd >= 0)
|
||||
close(data_fd);
|
||||
if (debug_fd >= 0)
|
||||
close(debug_fd);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct check_args *args = state->input;
|
||||
|
||||
switch (key) {
|
||||
case 'd':
|
||||
args->debug_path = strdup_or_error(state, arg);
|
||||
break;
|
||||
case 'e':
|
||||
case ARGP_KEY_ARG:
|
||||
if (!args->meta_device)
|
||||
args->meta_device = strdup_or_error(state, arg);
|
||||
else if (!args->data_device)
|
||||
args->data_device = strdup_or_error(state, arg);
|
||||
else
|
||||
argp_error(state, "more than two device arguments given");
|
||||
break;
|
||||
case ARGP_KEY_FINI:
|
||||
if (!args->meta_device)
|
||||
argp_error(state, "no metadata device argument given");
|
||||
if (!args->data_device)
|
||||
argp_error(state, "no data device argument given");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct argp_option options[] = {
|
||||
{ "debug", 'd', "FILE_PATH", 0, "Path to debug output file, will be created or truncated"},
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"META-DEVICE DATA-DEVICE",
|
||||
"Check filesystem consistency"
|
||||
};
|
||||
|
||||
/* Exit codes used by fsck-type programs */
|
||||
#define FSCK_EX_NONDESTRUCT 1 /* File system errors corrected */
|
||||
#define FSCK_EX_UNCORRECTED 4 /* File system errors left uncorrected */
|
||||
#define FSCK_EX_ERROR 8 /* Operational error */
|
||||
#define FSCK_EX_USAGE 16 /* Usage or syntax error */
|
||||
|
||||
static int check_cmd(int argc, char **argv)
|
||||
{
|
||||
struct check_args check_args = {NULL};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&argp, argc, argv, 0, NULL, &check_args);
|
||||
if (ret)
|
||||
exit(FSCK_EX_USAGE);
|
||||
|
||||
ret = do_check(&check_args);
|
||||
if (ret < 0)
|
||||
ret = FSCK_EX_ERROR;
|
||||
|
||||
if (problems_count() > 0)
|
||||
ret |= FSCK_EX_UNCORRECTED;
|
||||
|
||||
exit(ret);
|
||||
}
|
||||
|
||||
static void __attribute__((constructor)) check_ctor(void)
|
||||
{
|
||||
cmd_register_argp("check", &argp, GROUP_CORE, check_cmd);
|
||||
}
|
||||
16
utils/src/check/debug.c
Normal file
16
utils/src/check/debug.c
Normal file
@@ -0,0 +1,16 @@
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
int debug_fd = -1;
|
||||
|
||||
void debug_enable(int fd)
|
||||
{
|
||||
debug_fd = fd;
|
||||
}
|
||||
|
||||
void debug_disable(void)
|
||||
{
|
||||
if (debug_fd >= 0)
|
||||
debug_fd = -1;
|
||||
}
|
||||
17
utils/src/check/debug.h
Normal file
17
utils/src/check/debug.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_DEBUG_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_DEBUG_H_
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#define debug(fmt, args...) \
|
||||
do { \
|
||||
if (debug_fd >= 0) \
|
||||
dprintf(debug_fd, fmt"\n", ##args); \
|
||||
} while (0)
|
||||
|
||||
extern int debug_fd;
|
||||
|
||||
void debug_enable(int fd);
|
||||
void debug_disable(void);
|
||||
|
||||
#endif
|
||||
9
utils/src/check/eno.h
Normal file
9
utils/src/check/eno.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_ENO_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_ENO_H_
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
#define ENO_FMT "%d (%s)"
|
||||
#define ENO_ARG(eno) eno, strerror(eno)
|
||||
|
||||
#endif
|
||||
313
utils/src/check/extent.c
Normal file
313
utils/src/check/extent.c
Normal file
@@ -0,0 +1,313 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "lk_rbtree_wrapper.h"
|
||||
|
||||
#include "debug.h"
|
||||
#include "extent.h"
|
||||
|
||||
/*
|
||||
* In-memory extent management in rbtree nodes.
|
||||
*/
|
||||
|
||||
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len)
|
||||
{
|
||||
u64 a_end = a_start + a_len;
|
||||
u64 b_end = b_start + b_len;
|
||||
|
||||
return !((a_end <= b_start) || (b_end <= a_start));
|
||||
}
|
||||
|
||||
static int ext_contains(struct extent_node *ext, u64 start, u64 len)
|
||||
{
|
||||
return ext->start <= start && ext->start + ext->len >= start + len;
|
||||
}
|
||||
|
||||
/*
|
||||
* True if the given extent is bisected by the given range; there's
|
||||
* leftover containing extents on both the left and right sides of the
|
||||
* range in the extent.
|
||||
*/
|
||||
static int ext_bisected(struct extent_node *ext, u64 start, u64 len)
|
||||
{
|
||||
return ext->start < start && ext->start + ext->len > start + len;
|
||||
}
|
||||
|
||||
static struct extent_node *ext_from_rbnode(struct rb_node *rbnode)
|
||||
{
|
||||
return rbnode ? container_of(rbnode, struct extent_node, rbnode) : NULL;
|
||||
}
|
||||
|
||||
static struct extent_node *next_ext(struct extent_node *ext)
|
||||
{
|
||||
return ext ? ext_from_rbnode(rb_next(&ext->rbnode)) : NULL;
|
||||
}
|
||||
|
||||
static struct extent_node *prev_ext(struct extent_node *ext)
|
||||
{
|
||||
return ext ? ext_from_rbnode(rb_prev(&ext->rbnode)) : NULL;
|
||||
}
|
||||
|
||||
struct walk_results {
|
||||
unsigned bisect_to_leaf:1;
|
||||
struct extent_node *found;
|
||||
struct extent_node *next;
|
||||
struct rb_node *parent;
|
||||
struct rb_node **node;
|
||||
};
|
||||
|
||||
static void walk_extents(struct extent_root *root, u64 start, u64 len, struct walk_results *wlk)
|
||||
{
|
||||
struct rb_node **node = &root->rbroot.rb_node;
|
||||
struct extent_node *ext;
|
||||
u64 end = start + len;
|
||||
int cmp;
|
||||
|
||||
wlk->found = NULL;
|
||||
wlk->next = NULL;
|
||||
wlk->parent = NULL;
|
||||
|
||||
while (*node) {
|
||||
wlk->parent = *node;
|
||||
ext = ext_from_rbnode(*node);
|
||||
cmp = end <= ext->start ? -1 :
|
||||
start >= ext->start + ext->len ? 1 : 0;
|
||||
|
||||
if (cmp < 0) {
|
||||
node = &ext->rbnode.rb_left;
|
||||
wlk->next = ext;
|
||||
} else if (cmp > 0) {
|
||||
node = &ext->rbnode.rb_right;
|
||||
} else {
|
||||
wlk->found = ext;
|
||||
if (!(wlk->bisect_to_leaf && ext_bisected(ext, start, len)))
|
||||
break;
|
||||
/* walk right so we can insert greater right from bisection */
|
||||
node = &ext->rbnode.rb_right;
|
||||
}
|
||||
}
|
||||
|
||||
wlk->node = node;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return an extent that overlaps with the given range.
|
||||
*/
|
||||
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found)
|
||||
{
|
||||
struct walk_results wlk = { 0, };
|
||||
int ret;
|
||||
|
||||
walk_extents(root, start, len, &wlk);
|
||||
if (wlk.found) {
|
||||
memset(found, 0, sizeof(struct extent_node));
|
||||
found->start = wlk.found->start;
|
||||
found->len = wlk.found->len;
|
||||
ret = 0;
|
||||
} else {
|
||||
ret = -ENOENT;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Callers can iterate through direct node references and are entirely
|
||||
* responsible for consistency when doing so.
|
||||
*/
|
||||
struct extent_node *extent_first(struct extent_root *root)
|
||||
{
|
||||
struct walk_results wlk = { 0, };
|
||||
|
||||
walk_extents(root, 0, 1, &wlk);
|
||||
|
||||
return wlk.found ?: wlk.next;
|
||||
}
|
||||
|
||||
struct extent_node *extent_next(struct extent_node *ext)
|
||||
{
|
||||
return next_ext(ext);
|
||||
}
|
||||
|
||||
struct extent_node *extent_prev(struct extent_node *ext)
|
||||
{
|
||||
return prev_ext(ext);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a new extent into the tree. We can extend existing nodes,
|
||||
* merge with neighbours, or remove existing extents entirely if we
|
||||
* insert a range that fully spans existing nodes.
|
||||
*/
|
||||
static int walk_insert(struct extent_root *root, u64 start, u64 len, int found_err)
|
||||
{
|
||||
struct walk_results wlk = { 0, };
|
||||
struct extent_node *ext;
|
||||
struct extent_node *nei;
|
||||
int ret;
|
||||
|
||||
walk_extents(root, start, len, &wlk);
|
||||
|
||||
ext = wlk.found;
|
||||
if (ext && found_err) {
|
||||
ret = found_err;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!ext) {
|
||||
ext = malloc(sizeof(struct extent_node));
|
||||
if (!ext) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ext->start = start;
|
||||
ext->len = len;
|
||||
|
||||
rb_link_node(&ext->rbnode, wlk.parent, wlk.node);
|
||||
rb_insert_color(&ext->rbnode, &root->rbroot);
|
||||
}
|
||||
|
||||
/* start by expanding an existing extent if our range is larger */
|
||||
if (start < ext->start) {
|
||||
ext->len += ext->start - start;
|
||||
ext->start = start;
|
||||
}
|
||||
if (ext->start + ext->len < start + len)
|
||||
ext->len += (start + len) - (ext->start + ext->len);
|
||||
|
||||
/* drop any fully spanned neighbors, possibly merging with a final adjacent one */
|
||||
|
||||
while ((nei = prev_ext(ext))) {
|
||||
if (nei->start + nei->len < ext->start)
|
||||
break;
|
||||
|
||||
if (nei->start < ext->start) {
|
||||
ext->len += ext->start - nei->start;
|
||||
ext->start = nei->start;
|
||||
}
|
||||
|
||||
rb_erase(&nei->rbnode, &root->rbroot);
|
||||
free(nei);
|
||||
}
|
||||
|
||||
while ((nei = next_ext(ext))) {
|
||||
if (ext->start + ext->len < nei->start)
|
||||
break;
|
||||
|
||||
if (ext->start + ext->len < nei->start + nei->len)
|
||||
ext->len += (nei->start + nei->len) - (ext->start + ext->len);
|
||||
|
||||
rb_erase(&nei->rbnode, &root->rbroot);
|
||||
free(nei);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret < 0)
|
||||
debug("start %llu len %llu ret %d", start, len, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert a new extent. The specified extent must not overlap with any
|
||||
* existing extents or -EEXIST is returned.
|
||||
*/
|
||||
int extent_insert_new(struct extent_root *root, u64 start, u64 len)
|
||||
{
|
||||
return walk_insert(root, start, len, true);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert an extent, extending any existing extents that may overlap.
|
||||
*/
|
||||
int extent_insert_extend(struct extent_root *root, u64 start, u64 len)
|
||||
{
|
||||
return walk_insert(root, start, len, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove the specified extent from an existing node. The given extent must be fully
|
||||
* contained in a single node or -ENOENT is returned.
|
||||
*/
|
||||
int extent_remove(struct extent_root *root, u64 start, u64 len)
|
||||
{
|
||||
struct extent_node *ext;
|
||||
struct extent_node *ins;
|
||||
struct walk_results wlk = {
|
||||
.bisect_to_leaf = 1,
|
||||
};
|
||||
int ret;
|
||||
|
||||
walk_extents(root, start, len, &wlk);
|
||||
|
||||
if (!(ext = wlk.found) || !ext_contains(ext, start, len)) {
|
||||
ret = -ENOENT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (ext_bisected(ext, start, len)) {
|
||||
debug("found bisected start %llu len %llu", ext->start, ext->len);
|
||||
ins = malloc(sizeof(struct extent_node));
|
||||
if (!ins) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ins->start = start + len;
|
||||
ins->len = (ext->start + ext->len) - ins->start;
|
||||
|
||||
rb_link_node(&ins->rbnode, wlk.parent, wlk.node);
|
||||
rb_insert_color(&ins->rbnode, &root->rbroot);
|
||||
}
|
||||
|
||||
if (start > ext->start) {
|
||||
ext->len = start - ext->start;
|
||||
} else if (len < ext->len) {
|
||||
ext->start += len;
|
||||
ext->len -= len;
|
||||
} else {
|
||||
rb_erase(&ext->rbnode, &root->rbroot);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
debug("start %llu len %llu ret %d", start, len, ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void extent_root_init(struct extent_root *root)
|
||||
{
|
||||
root->rbroot = RB_ROOT;
|
||||
root->total = 0;
|
||||
}
|
||||
|
||||
void extent_root_free(struct extent_root *root)
|
||||
{
|
||||
struct extent_node *ext;
|
||||
struct rb_node *node;
|
||||
struct rb_node *tmp;
|
||||
|
||||
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
|
||||
ext = rb_entry(node, struct extent_node, rbnode);
|
||||
rb_erase(&ext->rbnode, &root->rbroot);
|
||||
free(ext);
|
||||
}
|
||||
}
|
||||
|
||||
void extent_root_print(struct extent_root *root)
|
||||
{
|
||||
struct extent_node *ext;
|
||||
struct rb_node *node;
|
||||
struct rb_node *tmp;
|
||||
|
||||
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
|
||||
ext = rb_entry(node, struct extent_node, rbnode);
|
||||
debug(" start %llu len %llu", ext->start, ext->len);
|
||||
}
|
||||
}
|
||||
38
utils/src/check/extent.h
Normal file
38
utils/src/check/extent.h
Normal file
@@ -0,0 +1,38 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_EXTENT_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_EXTENT_H_
|
||||
|
||||
#include "lk_rbtree_wrapper.h"
|
||||
|
||||
struct extent_root {
|
||||
struct rb_root rbroot;
|
||||
u64 total;
|
||||
};
|
||||
|
||||
struct extent_node {
|
||||
struct rb_node rbnode;
|
||||
u64 start;
|
||||
u64 len;
|
||||
};
|
||||
|
||||
typedef int (*extent_cb_t)(u64 start, u64 len, void *arg);
|
||||
|
||||
struct extent_cb_arg_t {
|
||||
extent_cb_t cb;
|
||||
void *cb_arg;
|
||||
};
|
||||
|
||||
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len);
|
||||
|
||||
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found);
|
||||
struct extent_node *extent_first(struct extent_root *root);
|
||||
struct extent_node *extent_next(struct extent_node *ext);
|
||||
struct extent_node *extent_prev(struct extent_node *ext);
|
||||
int extent_insert_new(struct extent_root *root, u64 start, u64 len);
|
||||
int extent_insert_extend(struct extent_root *root, u64 start, u64 len);
|
||||
int extent_remove(struct extent_root *root, u64 start, u64 len);
|
||||
|
||||
void extent_root_init(struct extent_root *root);
|
||||
void extent_root_free(struct extent_root *root);
|
||||
void extent_root_print(struct extent_root *root);
|
||||
|
||||
#endif
|
||||
540
utils/src/check/image.c
Normal file
540
utils/src/check/image.c
Normal file
@@ -0,0 +1,540 @@
|
||||
#define _GNU_SOURCE /* O_DIRECT */
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
#include <argp.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "bitmap.h"
|
||||
#include "parse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "crc.h"
|
||||
#include "cmd.h"
|
||||
#include "dev.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "block.h"
|
||||
#include "btree.h"
|
||||
#include "log_trees.h"
|
||||
#include "super.h"
|
||||
|
||||
/* huh. */
|
||||
#define OFF_MAX (off_t)((u64)((off_t)~0ULL) >> 1)
|
||||
|
||||
#define SCOUTFS_META_IMAGE_HEADER_MAGIC 0x8aee00d098fa60c5ULL
|
||||
#define SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC 0x70bd5e9269effd86ULL
|
||||
|
||||
struct scoutfs_meta_image_header {
|
||||
__le64 magic;
|
||||
__le64 total_bytes;
|
||||
__le32 version;
|
||||
} __packed;
|
||||
|
||||
struct scoutfs_meta_image_block_header {
|
||||
__le64 magic;
|
||||
__le64 offset;
|
||||
__le32 size;
|
||||
__le32 crc;
|
||||
} __packed;
|
||||
|
||||
struct image_args {
|
||||
char *meta_device;
|
||||
bool is_read;
|
||||
bool show_header;
|
||||
u64 ra_window;
|
||||
};
|
||||
|
||||
struct block_bitmaps {
|
||||
unsigned long *bits;
|
||||
u64 size;
|
||||
u64 count;
|
||||
};
|
||||
|
||||
#define errf(fmt, args...) \
|
||||
dprintf(STDERR_FILENO, fmt, ##args)
|
||||
|
||||
static int set_meta_bit(u64 start, u64 len, void *arg)
|
||||
{
|
||||
struct block_bitmaps *bm = arg;
|
||||
int ret;
|
||||
|
||||
if (len != 1) {
|
||||
ret = -EINVAL;
|
||||
} else {
|
||||
if (!test_bit(bm->bits, start)) {
|
||||
set_bit(bm->bits, start);
|
||||
bm->count++;
|
||||
}
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int get_ref_bits(struct block_bitmaps *bm)
|
||||
{
|
||||
struct scoutfs_super_block *super = global_super;
|
||||
int ret;
|
||||
u64 i;
|
||||
|
||||
/*
|
||||
* There are almost no small blocks we need to read, so we read
|
||||
* them as the large blocks that contain them to simplify the
|
||||
* block reading process.
|
||||
*/
|
||||
set_meta_bit(SCOUTFS_SUPER_BLKNO >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++)
|
||||
set_meta_bit((SCOUTFS_QUORUM_BLKNO + i) >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
|
||||
|
||||
ret = alloc_root_meta_iter(&super->meta_alloc[0], set_meta_bit, bm) ?:
|
||||
alloc_root_meta_iter(&super->meta_alloc[1], set_meta_bit, bm) ?:
|
||||
alloc_root_meta_iter(&super->data_alloc, set_meta_bit, bm) ?:
|
||||
alloc_list_meta_iter(&super->server_meta_avail[0], set_meta_bit, bm) ?:
|
||||
alloc_list_meta_iter(&super->server_meta_avail[1], set_meta_bit, bm) ?:
|
||||
alloc_list_meta_iter(&super->server_meta_freed[0], set_meta_bit, bm) ?:
|
||||
alloc_list_meta_iter(&super->server_meta_freed[1], set_meta_bit, bm) ?:
|
||||
btree_meta_iter(&super->fs_root, set_meta_bit, bm) ?:
|
||||
btree_meta_iter(&super->logs_root, set_meta_bit, bm) ?:
|
||||
btree_meta_iter(&super->log_merge, set_meta_bit, bm) ?:
|
||||
btree_meta_iter(&super->mounted_clients, set_meta_bit, bm) ?:
|
||||
btree_meta_iter(&super->srch_root, set_meta_bit, bm) ?:
|
||||
log_trees_meta_iter(set_meta_bit, bm);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Note that this temporarily modifies the header that it's given.
|
||||
*/
|
||||
static __le32 calc_crc(struct scoutfs_meta_image_block_header *bh, void *buf, size_t size)
|
||||
{
|
||||
__le32 saved = bh->crc;
|
||||
u32 crc = ~0;
|
||||
|
||||
bh->crc = 0;
|
||||
crc = crc32c(crc, bh, sizeof(*bh));
|
||||
crc = crc32c(crc, buf, size);
|
||||
bh->crc = saved;
|
||||
|
||||
return cpu_to_le32(crc);
|
||||
}
|
||||
|
||||
static void printf_header(struct scoutfs_meta_image_header *hdr)
|
||||
{
|
||||
errf("magic: 0x%016llx\n"
|
||||
"total_bytes: %llu\n"
|
||||
"version: %u\n",
|
||||
le64_to_cpu(hdr->magic),
|
||||
le64_to_cpu(hdr->total_bytes),
|
||||
le32_to_cpu(hdr->version));
|
||||
}
|
||||
|
||||
typedef ssize_t (*rw_func_t)(int fd, void *buf, size_t count, off_t offset);
|
||||
|
||||
static inline ssize_t rw_read(int fd, void *buf, size_t count, off_t offset)
|
||||
{
|
||||
return read(fd, buf, count);
|
||||
}
|
||||
|
||||
static inline ssize_t rw_pread(int fd, void *buf, size_t count, off_t offset)
|
||||
{
|
||||
return pread(fd, buf, count, offset);
|
||||
}
|
||||
|
||||
static inline ssize_t rw_write(int fd, void *buf, size_t count, off_t offset)
|
||||
{
|
||||
return write(fd, buf, count);
|
||||
}
|
||||
|
||||
static inline ssize_t rw_pwrite(int fd, void *buf, size_t count, off_t offset)
|
||||
{
|
||||
return pwrite(fd, buf, count, offset);
|
||||
}
|
||||
|
||||
static int rw_full_count(rw_func_t func, u64 *tot, int fd, void *buf, size_t count, off_t offset)
|
||||
{
|
||||
ssize_t sret;
|
||||
|
||||
while (count > 0) {
|
||||
sret = func(fd, buf, count, offset);
|
||||
if (sret <= 0 || sret > count) {
|
||||
if (sret < 0)
|
||||
return -errno;
|
||||
else
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
if (tot)
|
||||
*tot += sret;
|
||||
buf += sret;
|
||||
count -= sret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int read_image(struct image_args *args, int fd, struct block_bitmaps *bm)
|
||||
{
|
||||
struct scoutfs_meta_image_block_header bh;
|
||||
struct scoutfs_meta_image_header hdr;
|
||||
u64 opening;
|
||||
void *buf;
|
||||
off_t off;
|
||||
u64 bit;
|
||||
u64 ra;
|
||||
int ret;
|
||||
|
||||
buf = malloc(SCOUTFS_BLOCK_LG_SIZE);
|
||||
if (!buf) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
hdr.magic = cpu_to_le64(SCOUTFS_META_IMAGE_HEADER_MAGIC);
|
||||
hdr.total_bytes = cpu_to_le64(sizeof(hdr) +
|
||||
(bm->count * (SCOUTFS_BLOCK_LG_SIZE + sizeof(bh))));
|
||||
hdr.version = cpu_to_le32(1);
|
||||
|
||||
if (args->show_header) {
|
||||
printf_header(&hdr);
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &hdr, sizeof(hdr), 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
opening = args->ra_window;
|
||||
ra = 0;
|
||||
bit = 0;
|
||||
|
||||
for (bit = 0; (bit = find_next_set_bit(bm->bits, bit, bm->size)) < bm->size; bit++) {
|
||||
|
||||
/* readahead to open the full window, then a block at a time */
|
||||
do {
|
||||
ra = find_next_set_bit(bm->bits, ra, bm->size);
|
||||
if (ra < bm->size) {
|
||||
off = ra << SCOUTFS_BLOCK_LG_SHIFT;
|
||||
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_WILLNEED);
|
||||
ra++;
|
||||
if (opening)
|
||||
opening -= min(opening, SCOUTFS_BLOCK_LG_SIZE);
|
||||
}
|
||||
} while (opening > 0);
|
||||
|
||||
off = bit << SCOUTFS_BLOCK_LG_SHIFT;
|
||||
ret = rw_full_count(rw_pread, NULL, fd, buf, SCOUTFS_BLOCK_LG_SIZE, off);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Might as well try to drop the pages we've used to
|
||||
* reduce memory pressure on our read-ahead pages that
|
||||
* are waiting.
|
||||
*/
|
||||
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_DONTNEED);
|
||||
|
||||
bh.magic = cpu_to_le64(SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
|
||||
bh.offset = cpu_to_le64(off);
|
||||
bh.size = cpu_to_le32(SCOUTFS_BLOCK_LG_SIZE);
|
||||
bh.crc = calc_crc(&bh, buf, SCOUTFS_BLOCK_LG_SIZE);
|
||||
|
||||
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &bh, sizeof(bh), 0) ?:
|
||||
rw_full_count(rw_write, NULL, STDOUT_FILENO, buf, SCOUTFS_BLOCK_LG_SIZE, 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
free(buf);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int invalid_header(struct scoutfs_meta_image_header *hdr)
|
||||
{
|
||||
if (le64_to_cpu(hdr->magic) != SCOUTFS_META_IMAGE_HEADER_MAGIC) {
|
||||
errf("bad image header magic 0x%016llx (!= expected %016llx)\n",
|
||||
le64_to_cpu(hdr->magic), SCOUTFS_META_IMAGE_HEADER_MAGIC);
|
||||
|
||||
} else if (le32_to_cpu(hdr->version) != 1) {
|
||||
errf("unknown image header version %u\n", le32_to_cpu(hdr->version));
|
||||
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
/*
|
||||
* Doesn't catch offset+size overflowing, presumes pwrite() will return
|
||||
* an error.
|
||||
*/
|
||||
static int invalid_block_header(struct scoutfs_meta_image_block_header *bh)
|
||||
{
|
||||
if (le64_to_cpu(bh->magic) != SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC) {
|
||||
errf("bad block header magic 0x%016llx (!= expected %016llx)\n",
|
||||
le64_to_cpu(bh->magic), SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
|
||||
|
||||
} else if (le32_to_cpu(bh->size) == 0) {
|
||||
errf("invalid block header size %u\n", le32_to_cpu(bh->size));
|
||||
|
||||
} else if (le32_to_cpu(bh->size) > SIZE_MAX) {
|
||||
errf("block header size %u too large for size_t (> %zu)\n",
|
||||
le32_to_cpu(bh->size), (size_t)SIZE_MAX);
|
||||
|
||||
} else if (le64_to_cpu(bh->offset) > OFF_MAX) {
|
||||
errf("block header offset %llu too large for off_t (> %llu)\n",
|
||||
le64_to_cpu(bh->offset), (u64)OFF_MAX);
|
||||
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
static int write_image(struct image_args *args, int fd, struct block_bitmaps *bm)
|
||||
{
|
||||
struct scoutfs_meta_image_block_header bh;
|
||||
struct scoutfs_meta_image_header hdr;
|
||||
size_t writeback_batch = (2 * 1024 * 1024);
|
||||
size_t buf_size;
|
||||
size_t dirty;
|
||||
size_t size;
|
||||
off_t first;
|
||||
off_t last;
|
||||
off_t off;
|
||||
__le32 calc;
|
||||
void *buf;
|
||||
u64 tot;
|
||||
int ret;
|
||||
|
||||
tot = 0;
|
||||
|
||||
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &hdr, sizeof(hdr), 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (args->show_header) {
|
||||
printf_header(&hdr);
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = invalid_header(&hdr);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
dirty = 0;
|
||||
first = OFF_MAX;
|
||||
last = 0;
|
||||
buf = NULL;
|
||||
buf_size = 0;
|
||||
|
||||
while (tot < le64_to_cpu(hdr.total_bytes)) {
|
||||
|
||||
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &bh, sizeof(bh), 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = invalid_block_header(&bh);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
size = le32_to_cpu(bh.size);
|
||||
if (buf_size < size) {
|
||||
buf = realloc(buf, size);
|
||||
if (!buf) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
buf_size = size;
|
||||
}
|
||||
|
||||
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, buf, size, 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
calc = calc_crc(&bh, buf, size);
|
||||
if (calc != bh.crc) {
|
||||
errf("crc err");
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
off = le64_to_cpu(bh.offset);
|
||||
|
||||
ret = rw_full_count(rw_pwrite, NULL, fd, buf, size, off);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
dirty += size;
|
||||
first = min(first, off);
|
||||
last = max(last, off);
|
||||
if (dirty >= writeback_batch) {
|
||||
posix_fadvise(fd, first, last, POSIX_FADV_DONTNEED);
|
||||
dirty = 0;
|
||||
first = OFF_MAX;
|
||||
last = 0;
|
||||
}
|
||||
}
|
||||
|
||||
ret = fsync(fd);
|
||||
if (ret < 0) {
|
||||
ret = -errno;
|
||||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int do_image(struct image_args *args)
|
||||
{
|
||||
struct block_bitmaps bm = { .bits = NULL };
|
||||
int meta_fd = -1;
|
||||
u64 dev_size;
|
||||
mode_t mode;
|
||||
int ret;
|
||||
|
||||
mode = args->is_read ? O_RDONLY : O_RDWR;
|
||||
|
||||
meta_fd = open(args->meta_device, mode);
|
||||
if (meta_fd < 0) {
|
||||
ret = -errno;
|
||||
errf("failed to open meta device '%s': %s (%d)\n",
|
||||
args->meta_device, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (args->is_read) {
|
||||
ret = flush_device(meta_fd);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = get_device_size(args->meta_device, meta_fd, &dev_size);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
bm.size = DIV_ROUND_UP(dev_size, SCOUTFS_BLOCK_LG_SIZE);
|
||||
bm.bits = calloc(1, round_up(bm.size, BITS_PER_LONG) / 8);
|
||||
if (!bm.bits) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024) ?:
|
||||
check_supers(-1) ?:
|
||||
get_ref_bits(&bm) ?:
|
||||
read_image(args, meta_fd, &bm);
|
||||
block_shutdown();
|
||||
} else {
|
||||
ret = write_image(args, meta_fd, &bm);
|
||||
}
|
||||
out:
|
||||
free(bm.bits);
|
||||
|
||||
if (meta_fd >= 0)
|
||||
close(meta_fd);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct image_args *args = state->input;
|
||||
int ret;
|
||||
|
||||
switch (key) {
|
||||
case 'h':
|
||||
args->show_header = true;
|
||||
break;
|
||||
case 'r':
|
||||
ret = parse_u64(arg, &args->ra_window);
|
||||
if (ret)
|
||||
argp_error(state, "readahead winddoe parse error");
|
||||
break;
|
||||
case ARGP_KEY_ARG:
|
||||
if (!args->meta_device)
|
||||
args->meta_device = strdup_or_error(state, arg);
|
||||
else
|
||||
argp_error(state, "more than two device arguments given");
|
||||
break;
|
||||
case ARGP_KEY_FINI:
|
||||
if (!args->meta_device)
|
||||
argp_error(state, "no metadata device argument given");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct argp_option options[] = {
|
||||
{ "show-header", 'h', NULL, 0, "Print image header and exit without processing stream" },
|
||||
{ "readahead", 'r', "NR", 0, "Maintain read-ahead window of NR blocks" },
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
static struct argp read_image_argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"META-DEVICE",
|
||||
"Read metadata image stream from metadata device file"
|
||||
};
|
||||
|
||||
#define DEFAULT_RA_WINDOW (512 * 1024)
|
||||
|
||||
static int read_image_cmd(int argc, char **argv)
|
||||
{
|
||||
struct image_args image_args = {
|
||||
.is_read = true,
|
||||
.ra_window = DEFAULT_RA_WINDOW,
|
||||
};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&read_image_argp, argc, argv, 0, NULL, &image_args);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return do_image(&image_args);
|
||||
}
|
||||
|
||||
static struct argp write_image_argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"META-DEVICE",
|
||||
"Write metadata image stream to metadata device file"
|
||||
};
|
||||
|
||||
static int write_image_cmd(int argc, char **argv)
|
||||
{
|
||||
struct image_args image_args = {
|
||||
.is_read = false,
|
||||
.ra_window = DEFAULT_RA_WINDOW,
|
||||
};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&write_image_argp, argc, argv, 0, NULL, &image_args);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return do_image(&image_args);
|
||||
}
|
||||
|
||||
static void __attribute__((constructor)) image_ctor(void)
|
||||
{
|
||||
cmd_register_argp("read-metadata-image", &read_image_argp, GROUP_CORE, read_image_cmd);
|
||||
cmd_register_argp("write-metadata-image", &write_image_argp, GROUP_CORE, write_image_cmd);
|
||||
}
|
||||
15
utils/src/check/iter.h
Normal file
15
utils/src/check/iter.h
Normal file
@@ -0,0 +1,15 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_ITER_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_ITER_H_
|
||||
|
||||
/*
|
||||
* Callbacks can return a weird -errno that we'll never use to indicate
|
||||
* that iteration can stop and return 0 for success.
|
||||
*/
|
||||
#define ECHECK_ITER_DONE EL2HLT
|
||||
|
||||
static inline int xlate_iter_errno(int ret)
|
||||
{
|
||||
return ret == -ECHECK_ITER_DONE ? 0 : ret;
|
||||
}
|
||||
|
||||
#endif
|
||||
98
utils/src/check/log_trees.c
Normal file
98
utils/src/check/log_trees.c
Normal file
@@ -0,0 +1,98 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "key.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "btree.h"
|
||||
#include "debug.h"
|
||||
#include "extent.h"
|
||||
#include "iter.h"
|
||||
#include "sns.h"
|
||||
#include "log_trees.h"
|
||||
#include "super.h"
|
||||
|
||||
struct iter_args {
|
||||
extent_cb_t cb;
|
||||
void *cb_arg;
|
||||
};
|
||||
|
||||
static int lt_meta_iter(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
|
||||
{
|
||||
struct iter_args *ia = cb_arg;
|
||||
struct scoutfs_log_trees *lt;
|
||||
int ret;
|
||||
|
||||
if (val_len != sizeof(struct scoutfs_log_trees))
|
||||
; /* XXX */
|
||||
|
||||
lt = val;
|
||||
|
||||
sns_push("log_trees", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
|
||||
|
||||
debug("lt rid 0x%16llx nr %llu", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
|
||||
|
||||
sns_push("meta_avail", 0, 0);
|
||||
ret = alloc_list_meta_iter(<->meta_avail, ia->cb, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("meta_freed", 0, 0);
|
||||
ret = alloc_list_meta_iter(<->meta_freed, ia->cb, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("item_root", 0, 0);
|
||||
ret = btree_meta_iter(<->item_root, ia->cb, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (lt->bloom_ref.blkno) {
|
||||
sns_push("bloom_ref", 0, 0);
|
||||
ret = ia->cb(le64_to_cpu(lt->bloom_ref.blkno), 1, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0) {
|
||||
ret = xlate_iter_errno(ret);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
sns_push("data_avail", 0, 0);
|
||||
ret = alloc_root_meta_iter(<->data_avail, ia->cb, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("data_freed", 0, 0);
|
||||
ret = alloc_root_meta_iter(<->data_freed, ia->cb, ia->cb_arg);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
sns_pop();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Call the callers callback with the extent of all the metadata block references contained
|
||||
* in log btrees. We walk the logs_root btree items and walk all the metadata structures
|
||||
* they reference.
|
||||
*/
|
||||
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg)
|
||||
{
|
||||
struct scoutfs_super_block *super = global_super;
|
||||
struct iter_args ia = { .cb = cb, .cb_arg = cb_arg };
|
||||
|
||||
return btree_item_iter(&super->logs_root, lt_meta_iter, &ia);
|
||||
}
|
||||
8
utils/src/check/log_trees.h
Normal file
8
utils/src/check/log_trees.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
|
||||
|
||||
#include "extent.h"
|
||||
|
||||
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg);
|
||||
|
||||
#endif
|
||||
367
utils/src/check/meta.c
Normal file
367
utils/src/check/meta.c
Normal file
@@ -0,0 +1,367 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <sys/mman.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "bitmap.h"
|
||||
#include "key.h"
|
||||
|
||||
#include "alloc.h"
|
||||
#include "btree.h"
|
||||
#include "debug.h"
|
||||
#include "extent.h"
|
||||
#include "sns.h"
|
||||
#include "log_trees.h"
|
||||
#include "meta.h"
|
||||
#include "problem.h"
|
||||
#include "super.h"
|
||||
|
||||
static struct meta_data {
|
||||
struct extent_root meta_refed;
|
||||
struct extent_root meta_free;
|
||||
struct {
|
||||
u64 ref_blocks;
|
||||
u64 free_extents;
|
||||
u64 free_blocks;
|
||||
} stats;
|
||||
} global_mdat;
|
||||
|
||||
bool valid_meta_blkno(u64 blkno)
|
||||
{
|
||||
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
|
||||
|
||||
return blkno >= SCOUTFS_META_DEV_START_BLKNO && blkno < tot;
|
||||
}
|
||||
|
||||
static bool valid_meta_extent(u64 start, u64 len)
|
||||
{
|
||||
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
|
||||
bool valid;
|
||||
|
||||
valid = len > 0 &&
|
||||
start >= SCOUTFS_META_DEV_START_BLKNO &&
|
||||
start < tot &&
|
||||
len <= tot &&
|
||||
((start + len) <= tot) &&
|
||||
((start + len) > start);
|
||||
|
||||
debug("start %llu len %llu valid %u", start, len, !!valid);
|
||||
|
||||
if (!valid)
|
||||
problem(PB_META_EXTENT_INVALID, "start %llu len %llu", start, len);
|
||||
|
||||
return valid;
|
||||
}
|
||||
|
||||
/*
|
||||
* Track references to individual metadata blocks. This uses the extent
|
||||
* callback type but is only ever called for single block references.
|
||||
* Any reference to a block that has already been referenced is
|
||||
* considered invalid and is ignored. Later repair will resolve
|
||||
* duplicate references.
|
||||
*/
|
||||
static int insert_meta_ref(u64 start, u64 len, void *arg)
|
||||
{
|
||||
struct meta_data *mdat = &global_mdat;
|
||||
struct extent_root *root = arg;
|
||||
int ret = 0;
|
||||
|
||||
/* this is tracking single metadata block references */
|
||||
if (len != 1) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (valid_meta_blkno(start)) {
|
||||
ret = extent_insert_new(root, start, len);
|
||||
if (ret == 0)
|
||||
mdat->stats.ref_blocks++;
|
||||
else if (ret == -EEXIST)
|
||||
problem(PB_META_REF_OVERLAPS_EXISTING, "blkno %llu", start);
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int insert_meta_free(u64 start, u64 len, void *arg)
|
||||
{
|
||||
struct meta_data *mdat = &global_mdat;
|
||||
struct extent_root *root = arg;
|
||||
int ret = 0;
|
||||
|
||||
if (valid_meta_extent(start, len)) {
|
||||
ret = extent_insert_new(root, start, len);
|
||||
if (ret == 0) {
|
||||
mdat->stats.free_extents++;
|
||||
mdat->stats.free_blocks++;
|
||||
|
||||
} else if (ret == -EEXIST) {
|
||||
problem(PB_META_FREE_OVERLAPS_EXISTING,
|
||||
"start %llu llen %llu", start, len);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Walk all metadata references in the system. This walk doesn't need
|
||||
* to read metadata that doesn't contain any metadata references so it
|
||||
* can skip the bulk of metadata blocks. This gives us the set of
|
||||
* referenced metadata blocks which we can then use to repair metadata
|
||||
* allocator structures.
|
||||
*/
|
||||
static int get_meta_refs(void)
|
||||
{
|
||||
struct meta_data *mdat = &global_mdat;
|
||||
struct scoutfs_super_block *super = global_super;
|
||||
int ret;
|
||||
|
||||
extent_root_init(&mdat->meta_refed);
|
||||
|
||||
/* XXX record reserved blocks around super as referenced */
|
||||
|
||||
sns_push("meta_alloc", 0, 0);
|
||||
ret = alloc_root_meta_iter(&super->meta_alloc[0], insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("meta_alloc", 1, 0);
|
||||
ret = alloc_root_meta_iter(&super->meta_alloc[1], insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("data_alloc", 1, 0);
|
||||
ret = alloc_root_meta_iter(&super->data_alloc, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_avail", 0, 0);
|
||||
ret = alloc_list_meta_iter(&super->server_meta_avail[0],
|
||||
insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_avail", 1, 0);
|
||||
ret = alloc_list_meta_iter(&super->server_meta_avail[1],
|
||||
insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_freed", 0, 0);
|
||||
ret = alloc_list_meta_iter(&super->server_meta_freed[0],
|
||||
insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_freed", 1, 0);
|
||||
ret = alloc_list_meta_iter(&super->server_meta_freed[1],
|
||||
insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("fs_root", 0, 0);
|
||||
ret = btree_meta_iter(&super->fs_root, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("logs_root", 0, 0);
|
||||
ret = btree_meta_iter(&super->logs_root, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("log_merge", 0, 0);
|
||||
ret = btree_meta_iter(&super->log_merge, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("mounted_clients", 0, 0);
|
||||
ret = btree_meta_iter(&super->mounted_clients, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("srch_root", 0, 0);
|
||||
ret = btree_meta_iter(&super->srch_root, insert_meta_ref, &mdat->meta_refed);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = log_trees_meta_iter(insert_meta_ref, &mdat->meta_refed);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
debug("found %llu referenced metadata blocks", mdat->stats.ref_blocks);
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int get_meta_free(void)
|
||||
{
|
||||
struct meta_data *mdat = &global_mdat;
|
||||
struct scoutfs_super_block *super = global_super;
|
||||
int ret;
|
||||
|
||||
extent_root_init(&mdat->meta_free);
|
||||
|
||||
sns_push("meta_alloc", 0, 0);
|
||||
ret = alloc_root_extent_iter(&super->meta_alloc[0], insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("meta_alloc", 1, 0);
|
||||
ret = alloc_root_extent_iter(&super->meta_alloc[1], insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_avail", 0, 0);
|
||||
ret = alloc_list_extent_iter(&super->server_meta_avail[0],
|
||||
insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_avail", 1, 0);
|
||||
ret = alloc_list_extent_iter(&super->server_meta_avail[1],
|
||||
insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_freed", 0, 0);
|
||||
ret = alloc_list_extent_iter(&super->server_meta_freed[0],
|
||||
insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
sns_push("server_meta_freed", 1, 0);
|
||||
ret = alloc_list_extent_iter(&super->server_meta_freed[1],
|
||||
insert_meta_free, &mdat->meta_free);
|
||||
sns_pop();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
debug("found %llu free metadata blocks in %llu extents",
|
||||
mdat->stats.free_blocks, mdat->stats.free_extents);
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* All the space between referenced blocks must be recorded in the free
|
||||
* extents. The free extent walk didn't check that the extents
|
||||
* overlapped with references, we do that here. Remember that metadata
|
||||
* block references were merged into extents here, the refed extents
|
||||
* aren't necessarily all a single block.
|
||||
*/
|
||||
static int compare_refs_and_free(void)
|
||||
{
|
||||
struct meta_data *mdat = &global_mdat;
|
||||
struct extent_node *ref;
|
||||
struct extent_node *free;
|
||||
struct extent_node *next;
|
||||
struct extent_node *prev;
|
||||
u64 expect;
|
||||
u64 start;
|
||||
u64 end;
|
||||
|
||||
expect = 0;
|
||||
ref = extent_first(&mdat->meta_refed);
|
||||
free = extent_first(&mdat->meta_free);
|
||||
while (ref || free) {
|
||||
|
||||
debug("exp %llu ref %llu.%llu free %llu.%llu",
|
||||
expect, ref ? ref->start : 0, ref ? ref->len : 0,
|
||||
free ? free->start : 0, free ? free->len : 0);
|
||||
|
||||
/* referenced marked free, remove ref from free and continue from same point */
|
||||
if (ref && free && extents_overlap(ref->start, ref->len, free->start, free->len)) {
|
||||
debug("ref extent %llu.%llu overlaps free %llu %llu",
|
||||
ref->start, ref->len, free->start, free->len);
|
||||
|
||||
start = max(ref->start, free->start);
|
||||
end = min(ref->start + ref->len, free->start + free->len);
|
||||
|
||||
prev = extent_prev(free);
|
||||
|
||||
extent_remove(&mdat->meta_free, start, end - start);
|
||||
|
||||
if (prev)
|
||||
free = extent_next(prev);
|
||||
else
|
||||
free = extent_first(&mdat->meta_free);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* see which extent starts earlier */
|
||||
if (!free || (ref && ref->start <= free->start))
|
||||
next = ref;
|
||||
else
|
||||
next = free;
|
||||
|
||||
/* untracked region before next extent */
|
||||
if (expect < next->start) {
|
||||
debug("missing free extent %llu.%llu", expect, next->start - expect);
|
||||
expect = next->start;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
/* didn't overlap, advance past next extent */
|
||||
expect = next->start + next->len;
|
||||
if (next == ref)
|
||||
ref = extent_next(ref);
|
||||
else
|
||||
free = extent_next(free);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check the metadata allocators by comparing the set of referenced
|
||||
* blocks with the set of free blocks that are stored in free btree
|
||||
* items and alloc list blocks.
|
||||
*/
|
||||
int check_meta_alloc(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = get_meta_refs();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = get_meta_free();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = compare_refs_and_free();
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
9
utils/src/check/meta.h
Normal file
9
utils/src/check/meta.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_META_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_META_H_
|
||||
|
||||
bool valid_meta_blkno(u64 blkno);
|
||||
|
||||
int check_meta_alloc(void);
|
||||
|
||||
#endif
|
||||
|
||||
23
utils/src/check/padding.c
Normal file
23
utils/src/check/padding.c
Normal file
@@ -0,0 +1,23 @@
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "padding.h"
|
||||
|
||||
bool padding_is_zeros(const void *data, size_t sz)
|
||||
{
|
||||
static char zeros[32] = {0,};
|
||||
const size_t batch = array_size(zeros);
|
||||
|
||||
while (sz >= batch) {
|
||||
if (memcmp(data, zeros, batch))
|
||||
return false;
|
||||
data += batch;
|
||||
sz -= batch;
|
||||
}
|
||||
|
||||
if (sz > 0 && memcmp(data, zeros, sz))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
6
utils/src/check/padding.h
Normal file
6
utils/src/check/padding.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_PADDING_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_PADDING_H_
|
||||
|
||||
bool padding_is_zeros(const void *data, size_t sz);
|
||||
|
||||
#endif
|
||||
44
utils/src/check/problem.c
Normal file
44
utils/src/check/problem.c
Normal file
@@ -0,0 +1,44 @@
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "problem.h"
|
||||
|
||||
#define PROB_STR(pb) [pb] = #pb
|
||||
char *prob_strs[] = {
|
||||
PROB_STR(PB_META_EXTENT_INVALID),
|
||||
PROB_STR(PB_META_REF_OVERLAPS_EXISTING),
|
||||
PROB_STR(PB_META_FREE_OVERLAPS_EXISTING),
|
||||
PROB_STR(PB_BTREE_BLOCK_BAD_LEVEL),
|
||||
PROB_STR(PB_SB_HDR_CRC_INVALID),
|
||||
PROB_STR(PB_SB_HDR_MAGIC_INVALID),
|
||||
PROB_STR(PB_FS_IN_USE),
|
||||
PROB_STR(PB_MOUNTED_CLIENTS_REF_BLKNO),
|
||||
PROB_STR(PB_SB_BAD_FLAG),
|
||||
PROB_STR(PB_SB_BAD_FMT_VERS),
|
||||
PROB_STR(PB_QCONF_WRONG_VERSION),
|
||||
PROB_STR(PB_QSLOT_BAD_FAM),
|
||||
PROB_STR(PB_QSLOT_BAD_PORT),
|
||||
PROB_STR(PB_QSLOT_NO_ADDR),
|
||||
PROB_STR(PB_QSLOT_BAD_ADDR),
|
||||
PROB_STR(PB_DATA_DEV_SB_INVALID),
|
||||
};
|
||||
|
||||
static struct problem_data {
|
||||
uint64_t counts[PB__NR];
|
||||
uint64_t count;
|
||||
} global_pdat;
|
||||
|
||||
void problem_record(prob_t pb)
|
||||
{
|
||||
struct problem_data *pdat = &global_pdat;
|
||||
|
||||
pdat->counts[pb]++;
|
||||
pdat->count++;
|
||||
}
|
||||
|
||||
uint64_t problems_count(void)
|
||||
{
|
||||
struct problem_data *pdat = &global_pdat;
|
||||
|
||||
return pdat->count;
|
||||
}
|
||||
38
utils/src/check/problem.h
Normal file
38
utils/src/check/problem.h
Normal file
@@ -0,0 +1,38 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_PROBLEM_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_PROBLEM_H_
|
||||
|
||||
#include "debug.h"
|
||||
#include "sns.h"
|
||||
|
||||
typedef enum {
|
||||
PB_META_EXTENT_INVALID,
|
||||
PB_META_REF_OVERLAPS_EXISTING,
|
||||
PB_META_FREE_OVERLAPS_EXISTING,
|
||||
PB_BTREE_BLOCK_BAD_LEVEL,
|
||||
PB_SB_HDR_CRC_INVALID,
|
||||
PB_SB_HDR_MAGIC_INVALID,
|
||||
PB_FS_IN_USE,
|
||||
PB_MOUNTED_CLIENTS_REF_BLKNO,
|
||||
PB_SB_BAD_FLAG,
|
||||
PB_SB_BAD_FMT_VERS,
|
||||
PB_QCONF_WRONG_VERSION,
|
||||
PB_QSLOT_BAD_FAM,
|
||||
PB_QSLOT_BAD_PORT,
|
||||
PB_QSLOT_NO_ADDR,
|
||||
PB_QSLOT_BAD_ADDR,
|
||||
PB_DATA_DEV_SB_INVALID,
|
||||
PB__NR,
|
||||
} prob_t;
|
||||
|
||||
extern char *prob_strs[];
|
||||
|
||||
#define problem(pb, fmt, ...) \
|
||||
do { \
|
||||
debug("problem found: "#pb": %s: "fmt, sns_str(), __VA_ARGS__); \
|
||||
problem_record(pb); \
|
||||
} while (0)
|
||||
|
||||
void problem_record(prob_t pb);
|
||||
uint64_t problems_count(void);
|
||||
|
||||
#endif
|
||||
118
utils/src/check/sns.c
Normal file
118
utils/src/check/sns.c
Normal file
@@ -0,0 +1,118 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sns.h"
|
||||
|
||||
/*
|
||||
* This "str num stack" is used to describe our location in metadata at
|
||||
* any given time.
|
||||
*
|
||||
* As we descend into structures we pop a string on decribing them,
|
||||
* perhaps with associated numbers. Pushing and popping is very cheap
|
||||
* and only rarely do we format the stack into a string, as an arbitrary
|
||||
* example:
|
||||
* super.fs_root.btree_parent:1231.btree_leaf:3231"
|
||||
*/
|
||||
|
||||
#define SNS_MAX_DEPTH 1000
|
||||
#define SNS_STR_SIZE (SNS_MAX_DEPTH * (SNS_MAX_STR_LEN + 1 + 16 + 1))
|
||||
|
||||
static struct sns_data {
|
||||
unsigned int depth;
|
||||
|
||||
struct sns_entry {
|
||||
char *str;
|
||||
size_t len;
|
||||
u64 a;
|
||||
u64 b;
|
||||
} ents[SNS_MAX_DEPTH];
|
||||
|
||||
char str[SNS_STR_SIZE];
|
||||
|
||||
} global_lsdat;
|
||||
|
||||
void _sns_push(char *str, size_t len, u64 a, u64 b)
|
||||
{
|
||||
struct sns_data *lsdat = &global_lsdat;
|
||||
|
||||
if (lsdat->depth < SNS_MAX_DEPTH) {
|
||||
lsdat->ents[lsdat->depth++] = (struct sns_entry) {
|
||||
.str = str,
|
||||
.len = len,
|
||||
.a = a,
|
||||
.b = b,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void sns_pop(void)
|
||||
{
|
||||
struct sns_data *lsdat = &global_lsdat;
|
||||
|
||||
if (lsdat->depth > 0)
|
||||
lsdat->depth--;
|
||||
}
|
||||
|
||||
static char *append_str(char *pos, char *str, size_t len)
|
||||
{
|
||||
memcpy(pos, str, len);
|
||||
return pos + len;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is not called for x = 0 so we don't need to emit an initial 0.
|
||||
* We could by using do {} while instead of while {}.
|
||||
*/
|
||||
static char *append_u64x(char *pos, u64 x)
|
||||
{
|
||||
static char hex[] = "0123456789abcdef";
|
||||
|
||||
while (x) {
|
||||
*pos++ = hex[x & 0xf];
|
||||
x >>= 4;
|
||||
}
|
||||
|
||||
return pos;
|
||||
}
|
||||
|
||||
static char *append_char(char *pos, char c)
|
||||
{
|
||||
*(pos++) = c;
|
||||
return pos;
|
||||
}
|
||||
|
||||
/*
|
||||
* Return a pointer to a null terminated string that describes the
|
||||
* current location stack. The string buffer is global.
|
||||
*/
|
||||
char *sns_str(void)
|
||||
{
|
||||
struct sns_data *lsdat = &global_lsdat;
|
||||
struct sns_entry *ent;
|
||||
char *pos;
|
||||
int i;
|
||||
|
||||
pos = lsdat->str;
|
||||
for (i = 0; i < lsdat->depth; i++) {
|
||||
ent = &lsdat->ents[i];
|
||||
|
||||
if (i)
|
||||
pos = append_char(pos, '.');
|
||||
|
||||
pos = append_str(pos, ent->str, ent->len);
|
||||
|
||||
if (ent->a) {
|
||||
pos = append_char(pos, ':');
|
||||
pos = append_u64x(pos, ent->a);
|
||||
}
|
||||
|
||||
if (ent->b) {
|
||||
pos = append_char(pos, ':');
|
||||
pos = append_u64x(pos, ent->b);
|
||||
}
|
||||
}
|
||||
|
||||
*pos = '\0';
|
||||
|
||||
return lsdat->str;
|
||||
}
|
||||
20
utils/src/check/sns.h
Normal file
20
utils/src/check/sns.h
Normal file
@@ -0,0 +1,20 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_SNS_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_SNS_H_
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "sparse.h"
|
||||
|
||||
#define SNS_MAX_STR_LEN 20
|
||||
|
||||
#define sns_push(str, a, b) \
|
||||
do { \
|
||||
build_assert(sizeof(str) - 1 <= SNS_MAX_STR_LEN); \
|
||||
_sns_push((str), sizeof(str) - 1, a, b); \
|
||||
} while (0)
|
||||
|
||||
void _sns_push(char *str, size_t len, u64 a, u64 b);
|
||||
void sns_pop(void);
|
||||
char *sns_str(void);
|
||||
|
||||
#endif
|
||||
252
utils/src/check/super.c
Normal file
252
utils/src/check/super.c
Normal file
@@ -0,0 +1,252 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <errno.h>
|
||||
#include <sys/socket.h>
|
||||
#include <arpa/inet.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "crc.h"
|
||||
|
||||
#include "block.h"
|
||||
#include "super.h"
|
||||
#include "problem.h"
|
||||
|
||||
/*
|
||||
* After we check the super blocks we provide a global buffer to track
|
||||
* the current super block. It is referenced to get static information
|
||||
* about the system and is also modified and written as part of
|
||||
* transactions.
|
||||
*/
|
||||
struct scoutfs_super_block *global_super;
|
||||
|
||||
/*
|
||||
* Check superblock crc. We can't use global_super here since it's not the
|
||||
* whole block itself, but only the struct scoutfs_super_block, so it needs
|
||||
* to reload a copy here.
|
||||
*/
|
||||
int check_super_crc(void)
|
||||
{
|
||||
struct scoutfs_super_block *super = NULL;
|
||||
struct scoutfs_block_header *hdr;
|
||||
struct block *blk = NULL;
|
||||
u32 crc;
|
||||
int ret;
|
||||
|
||||
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_DIRTY);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "error reading super block\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
super = block_buf(blk);
|
||||
crc = crc_block((struct scoutfs_block_header *)super, block_size(blk));
|
||||
hdr = &global_super->hdr;
|
||||
debug("superblock crc 0x%04x calculated 0x%04x " "%s", le32_to_cpu(hdr->crc), crc, le32_to_cpu(hdr->crc) == crc ? "(match)" : "(mismatch)");
|
||||
|
||||
if (crc != le32_to_cpu(hdr->crc))
|
||||
problem(PB_SB_HDR_CRC_INVALID, "crc 0x%04x calculated 0x%04x", le32_to_cpu(hdr->crc), crc);
|
||||
block_put(&blk);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Crude check for the unlikely cases where the fs appears to still be mounted.
|
||||
*/
|
||||
int check_super_in_use(int meta_fd)
|
||||
{
|
||||
int ret = meta_super_in_use(meta_fd, global_super);
|
||||
debug("meta_super_in_use ret %d", ret);
|
||||
|
||||
if (ret < 0)
|
||||
problem(PB_FS_IN_USE, "File system appears in use. ret %d", ret);
|
||||
|
||||
debug("global_super->mounted_clients.ref.blkno 0x%08llx", global_super->mounted_clients.ref.blkno);
|
||||
if (global_super->mounted_clients.ref.blkno != 0)
|
||||
problem(PB_MOUNTED_CLIENTS_REF_BLKNO, "Mounted clients ref blkno 0x%08llx",
|
||||
global_super->mounted_clients.ref.blkno);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* quick glance data device superblock checks.
|
||||
*
|
||||
* -EIO for crc failures, all others -EINVAL
|
||||
*
|
||||
* caller must have run check_supers() first so that global_super is
|
||||
* setup, so that we can cross-ref to it.
|
||||
*/
|
||||
static int check_data_super(int data_fd)
|
||||
{
|
||||
struct scoutfs_super_block *super = NULL;
|
||||
char *buf;
|
||||
int ret = 0;
|
||||
u32 crc;
|
||||
ssize_t size = SCOUTFS_BLOCK_SM_SIZE;
|
||||
off_t off = SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT;
|
||||
|
||||
buf = aligned_alloc(4096, size); /* XXX static alignment :/ */
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
memset(buf, 0, size);
|
||||
|
||||
if (lseek(data_fd, off, SEEK_SET) != off)
|
||||
return -errno;
|
||||
|
||||
if (read(data_fd, buf, size) < 0) {
|
||||
ret = -errno;
|
||||
goto out;
|
||||
}
|
||||
|
||||
super = (struct scoutfs_super_block *)buf;
|
||||
|
||||
crc = crc_block((struct scoutfs_block_header *)buf, size);
|
||||
|
||||
debug("data fsid 0x%016llx", le64_to_cpu(super->hdr.fsid));
|
||||
debug("data super magic 0x%04x", super->hdr.magic);
|
||||
debug("data crc calc 0x%08x exp 0x%08x %s", crc, le32_to_cpu(super->hdr.crc),
|
||||
crc == le32_to_cpu(super->hdr.crc) ? "(match)" : "(mismatch)");
|
||||
debug("data flags %llu fmt_vers %llu", le64_to_cpu(super->flags), le64_to_cpu(super->fmt_vers));
|
||||
|
||||
if (crc != le32_to_cpu(super->hdr.crc))
|
||||
/* tis but a scratch */
|
||||
ret = -EIO;
|
||||
|
||||
if (le64_to_cpu(super->hdr.fsid) != le64_to_cpu(global_super->hdr.fsid))
|
||||
/* mismatched data bdev? not good */
|
||||
ret = -EINVAL;
|
||||
|
||||
if (le32_to_cpu(super->hdr.magic) != SCOUTFS_BLOCK_MAGIC_SUPER)
|
||||
/* fsid matched but not a superblock? yikes */
|
||||
ret = -EINVAL;
|
||||
|
||||
if (le64_to_cpu(super->flags) != 0) /* !SCOUTFS_FLAG_IS_META_BDEV */
|
||||
ret = -EINVAL;
|
||||
|
||||
if ((le64_to_cpu(super->fmt_vers) < SCOUTFS_FORMAT_VERSION_MIN) ||
|
||||
(le64_to_cpu(super->fmt_vers) > SCOUTFS_FORMAT_VERSION_MAX))
|
||||
ret = -EINVAL;
|
||||
|
||||
if (ret != 0)
|
||||
problem(PB_DATA_DEV_SB_INVALID, "data device is invalid or corrupt (%d)", ret);
|
||||
out:
|
||||
free(buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* After checking the supers we save a copy of it in a global buffer that's used by
|
||||
* other modules to track the current super. It can be modified and written during commits.
|
||||
*/
|
||||
int check_supers(int data_fd)
|
||||
{
|
||||
struct scoutfs_super_block *super = NULL;
|
||||
struct block *blk = NULL;
|
||||
struct scoutfs_quorum_slot* slot = NULL;
|
||||
struct in_addr in;
|
||||
uint16_t family;
|
||||
uint16_t port;
|
||||
int ret;
|
||||
|
||||
sns_push("supers", 0, 0);
|
||||
|
||||
global_super = malloc(sizeof(struct scoutfs_super_block));
|
||||
if (!global_super) {
|
||||
fprintf(stderr, "error allocating super block buffer\n");
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "error reading super block\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = block_hdr_valid(blk, SCOUTFS_SUPER_BLKNO, BF_SM, SCOUTFS_BLOCK_MAGIC_SUPER);
|
||||
|
||||
super = block_buf(blk);
|
||||
|
||||
if (ret < 0) {
|
||||
/* */
|
||||
if (ret == -EINVAL) {
|
||||
/* that's really bad */
|
||||
fprintf(stderr, "superblock invalid magic\n");
|
||||
goto out;
|
||||
} else if (ret == -EIO)
|
||||
/* just report/count a CRC error */
|
||||
problem(PB_SB_HDR_MAGIC_INVALID, "superblock magic invalid: 0x%04x is not 0x%04x",
|
||||
super->hdr.magic, SCOUTFS_BLOCK_MAGIC_SUPER);
|
||||
}
|
||||
|
||||
memcpy(global_super, super, sizeof(struct scoutfs_super_block));
|
||||
|
||||
debug("Superblock flag: %llu", global_super->flags);
|
||||
if (le64_to_cpu(global_super->flags) != SCOUTFS_FLAG_IS_META_BDEV)
|
||||
problem(PB_SB_BAD_FLAG, "Bad flag: %llu expecting: 1 or 0", global_super->flags);
|
||||
|
||||
debug("Superblock fmt_vers: %llu", le64_to_cpu(global_super->fmt_vers));
|
||||
if ((le64_to_cpu(global_super->fmt_vers) < SCOUTFS_FORMAT_VERSION_MIN) ||
|
||||
(le64_to_cpu(global_super->fmt_vers) > SCOUTFS_FORMAT_VERSION_MAX))
|
||||
problem(PB_SB_BAD_FMT_VERS, "Bad fmt_vers: %llu outside supported range (%d-%d)",
|
||||
le64_to_cpu(global_super->fmt_vers), SCOUTFS_FORMAT_VERSION_MIN,
|
||||
SCOUTFS_FORMAT_VERSION_MAX);
|
||||
|
||||
debug("Quorum Config Version: %llu", global_super->qconf.version);
|
||||
if (le64_to_cpu(global_super->qconf.version) != 1)
|
||||
problem(PB_QCONF_WRONG_VERSION, "Wrong Version: %llu (expected 1)", global_super->qconf.version);
|
||||
|
||||
for (int i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
slot = &global_super->qconf.slots[i];
|
||||
family = le16_to_cpu(slot->addr.v4.family);
|
||||
port = le16_to_cpu(slot->addr.v4.port);
|
||||
in.s_addr = le32_to_cpu(slot->addr.v4.addr);
|
||||
|
||||
if (family == SCOUTFS_AF_NONE) {
|
||||
debug("Quorum slot %u is empty", i);
|
||||
continue;
|
||||
}
|
||||
|
||||
debug("Quorum slot %u family: %u, port: %u, address: %s", i, family, port, inet_ntoa(in));
|
||||
if (family != SCOUTFS_AF_IPV4)
|
||||
problem(PB_QSLOT_BAD_FAM, "Quorum Slot %u doesn't have valid address", i);
|
||||
|
||||
if (port == 0)
|
||||
problem(PB_QSLOT_BAD_PORT, "Quorum Slot %u has bad port", i);
|
||||
|
||||
if (!in.s_addr) {
|
||||
problem(PB_QSLOT_NO_ADDR, "Quorum Slot %u has not been assigned ipv4 address", i);
|
||||
} else if (!(in.s_addr & 0xff000000)) {
|
||||
problem(PB_QSLOT_BAD_ADDR, "Quorum Slot %u has invalid ipv4 address", i);
|
||||
} else if ((in.s_addr & 0xff) == 0xff) {
|
||||
problem(PB_QSLOT_BAD_ADDR, "Quorum Slot %u has invalid ipv4 address", i);
|
||||
}
|
||||
}
|
||||
|
||||
debug("super magic 0x%04x", global_super->hdr.magic);
|
||||
if (le32_to_cpu(global_super->hdr.magic) != SCOUTFS_BLOCK_MAGIC_SUPER)
|
||||
problem(PB_SB_HDR_MAGIC_INVALID, "superblock magic invalid: 0x%04x is not 0x%04x",
|
||||
global_super->hdr.magic, SCOUTFS_BLOCK_MAGIC_SUPER);
|
||||
|
||||
/* `scoutfs image` command doesn't open data_fd */
|
||||
if (data_fd < 0)
|
||||
ret = 0;
|
||||
else
|
||||
ret = check_data_super(data_fd);
|
||||
out:
|
||||
block_put(&blk);
|
||||
|
||||
sns_pop();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void super_shutdown(void)
|
||||
{
|
||||
free(global_super);
|
||||
}
|
||||
12
utils/src/check/super.h
Normal file
12
utils/src/check/super.h
Normal file
@@ -0,0 +1,12 @@
|
||||
#ifndef _SCOUTFS_UTILS_CHECK_SUPER_H_
|
||||
#define _SCOUTFS_UTILS_CHECK_SUPER_H_
|
||||
|
||||
extern struct scoutfs_super_block *global_super;
|
||||
|
||||
int check_super_crc();
|
||||
int check_supers(int data_fd);
|
||||
int super_commit(void);
|
||||
int check_super_in_use(int meta_fd);
|
||||
void super_shutdown(void);
|
||||
|
||||
#endif
|
||||
@@ -156,6 +156,16 @@ static inline void list_move_tail(struct list_head *list,
|
||||
list_add_tail(list, head);
|
||||
}
|
||||
|
||||
/**
|
||||
* list_is_head - tests whether @list is the list @head
|
||||
* @list: the entry to test
|
||||
* @head: the head of the list
|
||||
*/
|
||||
static inline int list_is_head(const struct list_head *list, const struct list_head *head)
|
||||
{
|
||||
return list == head;
|
||||
}
|
||||
|
||||
/**
|
||||
* list_empty - tests whether a list is empty
|
||||
* @head: the list to test.
|
||||
@@ -242,6 +252,15 @@ static inline void list_splice_init(struct list_head *list,
|
||||
for (pos = (head)->next, n = pos->next; pos != (head); \
|
||||
pos = n, n = pos->next)
|
||||
|
||||
/**
|
||||
* list_entry_is_head - test if the entry points to the head of the list
|
||||
* @pos: the type * to cursor
|
||||
* @head: the head for your list.
|
||||
* @member: the name of the list_head within the struct.
|
||||
*/
|
||||
#define list_entry_is_head(pos, head, member) \
|
||||
(&pos->member == (head))
|
||||
|
||||
/**
|
||||
* list_for_each_entry - iterate over list of given type
|
||||
* @pos: the type * to use as a loop counter.
|
||||
@@ -307,4 +326,28 @@ static inline void list_splice_init(struct list_head *list,
|
||||
#define list_next_entry(pos, member) \
|
||||
list_entry((pos)->member.next, typeof(*(pos)), member)
|
||||
|
||||
/**
|
||||
* list_prev_entry - get the prev element in list
|
||||
* @pos: the type * to cursor
|
||||
* @member: the name of the list_head within the struct.
|
||||
*/
|
||||
#define list_prev_entry(pos, member) \
|
||||
list_entry((pos)->member.prev, typeof(*(pos)), member)
|
||||
|
||||
/**
|
||||
* list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
|
||||
* @pos: the type * to use as a loop cursor.
|
||||
* @n: another type * to use as temporary storage
|
||||
* @head: the head for your list.
|
||||
* @member: the name of the list_head within the struct.
|
||||
*
|
||||
* Iterate backwards over list of given type, safe against removal
|
||||
* of list entry.
|
||||
*/
|
||||
#define list_for_each_entry_safe_reverse(pos, n, head, member) \
|
||||
for (pos = list_last_entry(head, typeof(*pos), member), \
|
||||
n = list_prev_entry(pos, member); \
|
||||
!list_entry_is_head(pos, head, member); \
|
||||
pos = n, n = list_prev_entry(n, member))
|
||||
|
||||
#endif
|
||||
|
||||
24
utils/src/lk_rbtree_wrapper.h
Normal file
24
utils/src/lk_rbtree_wrapper.h
Normal file
@@ -0,0 +1,24 @@
|
||||
#ifndef _LK_RBTREE_WRAPPER_H_
|
||||
#define _LK_RBTREE_WRAPPER_H_
|
||||
|
||||
/*
|
||||
* We're using this lame hack to build and use the kernel's rbtree in
|
||||
* userspace. We drop the kernel's rbtree*[ch] implementation in and
|
||||
* use them with this wrapper. We only have to remove the kernel
|
||||
* includes from the imported files.
|
||||
*/
|
||||
|
||||
#include <stdbool.h>
|
||||
#include "util.h"
|
||||
|
||||
#define rcu_assign_pointer(a, b) do { a = b; } while (0)
|
||||
#define READ_ONCE(a) ({ a; })
|
||||
#define WRITE_ONCE(a, b) do { a = b; } while (0)
|
||||
#define unlikely(a) ({ a; })
|
||||
#define EXPORT_SYMBOL(a) /* nop */
|
||||
|
||||
#include "rbtree_types.h"
|
||||
#include "rbtree.h"
|
||||
#include "rbtree_augmented.h"
|
||||
|
||||
#endif
|
||||
24
utils/src/mode_types.c
Normal file
24
utils/src/mode_types.c
Normal file
@@ -0,0 +1,24 @@
|
||||
#include <unistd.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "mode_types.h"
|
||||
|
||||
unsigned int mode_to_type(mode_t mode)
|
||||
{
|
||||
#define S_SHIFT 12
|
||||
static unsigned char mode_types[S_IFMT >> S_SHIFT] = {
|
||||
[S_IFIFO >> S_SHIFT] = SCOUTFS_DT_FIFO,
|
||||
[S_IFCHR >> S_SHIFT] = SCOUTFS_DT_CHR,
|
||||
[S_IFDIR >> S_SHIFT] = SCOUTFS_DT_DIR,
|
||||
[S_IFBLK >> S_SHIFT] = SCOUTFS_DT_BLK,
|
||||
[S_IFREG >> S_SHIFT] = SCOUTFS_DT_REG,
|
||||
[S_IFLNK >> S_SHIFT] = SCOUTFS_DT_LNK,
|
||||
[S_IFSOCK >> S_SHIFT] = SCOUTFS_DT_SOCK,
|
||||
};
|
||||
|
||||
return mode_types[(mode & S_IFMT) >> S_SHIFT];
|
||||
#undef S_SHIFT
|
||||
}
|
||||
6
utils/src/mode_types.h
Normal file
6
utils/src/mode_types.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#ifndef _MODE_TYPES_H_
|
||||
#define _MODE_TYPES_H_
|
||||
|
||||
unsigned int mode_to_type(mode_t mode);
|
||||
|
||||
#endif
|
||||
46
utils/src/name_hash.h
Normal file
46
utils/src/name_hash.h
Normal file
@@ -0,0 +1,46 @@
|
||||
#ifndef _SCOUTFS_NAME_HASH_H_
|
||||
#define _SCOUTFS_NAME_HASH_H_
|
||||
|
||||
#include "hash.h"
|
||||
|
||||
/*
|
||||
* Test a bit number as though an array of bytes is a large len-bit
|
||||
* big-endian value. nr 0 is the LSB of the final byte, nr (len - 1) is
|
||||
* the MSB of the first byte.
|
||||
*/
|
||||
static int test_be_bytes_bit(int nr, const char *bytes, int len)
|
||||
{
|
||||
return bytes[(len - 1 - nr) >> 3] & (1 << (nr & 7));
|
||||
}
|
||||
|
||||
/*
|
||||
* Generate a 32bit "fingerprint" of the name by extracting 32 evenly
|
||||
* distributed bits from the name. The intent is to have the sort order
|
||||
* of the fingerprints reflect the memcmp() sort order of the names
|
||||
* while mapping large names down to small fs keys.
|
||||
*
|
||||
* Names that are smaller than 32bits are biased towards the high bits
|
||||
* of the fingerprint so that most significant bits of the fingerprints
|
||||
* consistently reflect the initial characters of the names.
|
||||
*/
|
||||
static inline u32 dirent_name_fingerprint(const char *name, unsigned int name_len)
|
||||
{
|
||||
int name_bits = name_len * 8;
|
||||
int skip = max(name_bits / 32, 1);
|
||||
u32 fp = 0;
|
||||
int f;
|
||||
int n;
|
||||
|
||||
for (f = 31, n = name_bits - 1; f >= 0 && n >= 0; f--, n -= skip)
|
||||
fp |= !!test_be_bytes_bit(n, name, name_bits) << f;
|
||||
|
||||
return fp;
|
||||
}
|
||||
|
||||
static inline u64 dirent_name_hash(const char *name, unsigned int name_len)
|
||||
{
|
||||
return scoutfs_hash32(name, name_len) |
|
||||
((u64)dirent_name_fingerprint(name, name_len) << 32);
|
||||
}
|
||||
|
||||
#endif
|
||||
1979
utils/src/parallel_restore.c
Normal file
1979
utils/src/parallel_restore.c
Normal file
File diff suppressed because it is too large
Load Diff
125
utils/src/parallel_restore.h
Normal file
125
utils/src/parallel_restore.h
Normal file
@@ -0,0 +1,125 @@
|
||||
#ifndef _SCOUTFS_PARALLEL_RESTORE_H_
|
||||
#define _SCOUTFS_PARALLEL_RESTORE_H_
|
||||
|
||||
#include <errno.h>
|
||||
|
||||
struct scoutfs_parallel_restore_progress {
|
||||
struct scoutfs_btree_root fs_items;
|
||||
struct scoutfs_btree_root root_items;
|
||||
struct scoutfs_srch_file sfl;
|
||||
struct scoutfs_block_ref bloom_ref;
|
||||
__le64 inode_count;
|
||||
__le64 max_ino;
|
||||
};
|
||||
|
||||
struct scoutfs_parallel_restore_slice {
|
||||
__le64 fsid;
|
||||
__le64 meta_start;
|
||||
__le64 meta_len;
|
||||
};
|
||||
|
||||
struct scoutfs_parallel_restore_entry {
|
||||
u64 dir_ino;
|
||||
u64 pos;
|
||||
u64 ino;
|
||||
mode_t mode;
|
||||
char *name;
|
||||
unsigned int name_len;
|
||||
};
|
||||
|
||||
struct scoutfs_parallel_restore_xattr {
|
||||
u64 ino;
|
||||
u64 pos;
|
||||
char *name;
|
||||
unsigned int name_len;
|
||||
void *value;
|
||||
unsigned int value_len;
|
||||
};
|
||||
|
||||
struct scoutfs_parallel_restore_inode {
|
||||
/* all inodes */
|
||||
u64 ino;
|
||||
u64 meta_seq;
|
||||
u64 data_seq;
|
||||
u64 nr_xattrs;
|
||||
u32 uid;
|
||||
u32 gid;
|
||||
u32 mode;
|
||||
u32 rdev;
|
||||
u32 flags;
|
||||
u8 pad[4];
|
||||
struct timespec atime;
|
||||
struct timespec ctime;
|
||||
struct timespec mtime;
|
||||
struct timespec crtime;
|
||||
u64 proj;
|
||||
|
||||
/* regular files */
|
||||
u64 data_version;
|
||||
u64 size;
|
||||
bool offline;
|
||||
|
||||
/* only used for directories */
|
||||
u64 nr_subdirs;
|
||||
u64 total_entry_name_bytes;
|
||||
|
||||
/* only used for symlnks */
|
||||
char *target;
|
||||
unsigned int target_len; /* not including null terminator */
|
||||
};
|
||||
|
||||
struct scoutfs_parallel_restore_quota_rule {
|
||||
u64 limit;
|
||||
u8 prio;
|
||||
u8 op;
|
||||
u8 rule_flags;
|
||||
struct quota_rule_name {
|
||||
u64 val;
|
||||
u8 source;
|
||||
u8 flags;
|
||||
} names [3];
|
||||
char *value;
|
||||
unsigned int value_len;
|
||||
};
|
||||
|
||||
typedef __typeof__(EINVAL) spr_err_t;
|
||||
|
||||
struct scoutfs_parallel_restore_writer;
|
||||
|
||||
spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip);
|
||||
void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip);
|
||||
|
||||
spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri,
|
||||
struct scoutfs_parallel_restore_slice *slices,
|
||||
int nr);
|
||||
spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri,
|
||||
struct scoutfs_parallel_restore_slice *slice);
|
||||
spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri,
|
||||
struct scoutfs_parallel_restore_slice *slice);
|
||||
|
||||
spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri,
|
||||
struct scoutfs_parallel_restore_inode *inode);
|
||||
spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri,
|
||||
struct scoutfs_parallel_restore_entry *entry);
|
||||
spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri,
|
||||
struct scoutfs_parallel_restore_xattr *xattr);
|
||||
|
||||
spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri,
|
||||
struct scoutfs_parallel_restore_progress *prog);
|
||||
spr_err_t scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri,
|
||||
struct scoutfs_parallel_restore_progress *prog);
|
||||
|
||||
spr_err_t scoutfs_parallel_restore_add_quota_rule(struct scoutfs_parallel_restore_writer *wri,
|
||||
struct scoutfs_parallel_restore_quota_rule *rule);
|
||||
|
||||
spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri,
|
||||
void *buf, size_t len, off_t *off_ret,
|
||||
size_t *count_ret);
|
||||
|
||||
spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri,
|
||||
struct scoutfs_super_block *super, int fd);
|
||||
spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer *wri,
|
||||
struct scoutfs_super_block *super);
|
||||
|
||||
|
||||
#endif
|
||||
@@ -645,6 +645,8 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
|
||||
u64 blkno;
|
||||
u64 start;
|
||||
u64 len;
|
||||
u64 st;
|
||||
u64 nr;
|
||||
int wid;
|
||||
int ret;
|
||||
int i;
|
||||
@@ -663,27 +665,37 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
|
||||
AL_REF_A(&lblk->next), le32_to_cpu(lblk->start),
|
||||
le32_to_cpu(lblk->nr));
|
||||
|
||||
if (lblk->nr) {
|
||||
wid = printf(" exts: ");
|
||||
start = 0;
|
||||
len = 0;
|
||||
for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
|
||||
if (len == 0)
|
||||
start = le64_to_cpu(lblk->blknos[i]);
|
||||
len++;
|
||||
|
||||
if (i == (le32_to_cpu(lblk->nr) - 1) ||
|
||||
start + len != le64_to_cpu(lblk->blknos[i + 1])) {
|
||||
if (wid >= 72)
|
||||
wid = printf("\n ");
|
||||
|
||||
wid += printf("%llu,%llu ", start, len);
|
||||
len = 0;
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
st = le32_to_cpu(lblk->start);
|
||||
nr = le32_to_cpu(lblk->nr);
|
||||
if (st >= SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
|
||||
nr > SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
|
||||
(st + nr) > SCOUTFS_ALLOC_LIST_MAX_BLOCKS) {
|
||||
printf(" (invalid start and nr fields)\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (lblk->nr == 0)
|
||||
goto out;
|
||||
|
||||
wid = printf(" exts: ");
|
||||
start = 0;
|
||||
len = 0;
|
||||
for (i = 0; i < nr; i++) {
|
||||
if (len == 0)
|
||||
start = le64_to_cpu(lblk->blknos[st + i]);
|
||||
len++;
|
||||
|
||||
if (i == (nr - 1) || (start + len) != le64_to_cpu(lblk->blknos[st + i + 1])) {
|
||||
if (wid >= 72)
|
||||
wid = printf("\n ");
|
||||
|
||||
wid += printf("%llu,%llu ", start, len);
|
||||
len = 0;
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
out:
|
||||
next = lblk->next;
|
||||
free(lblk);
|
||||
return print_alloc_list_block(fd, str, &next);
|
||||
|
||||
629
utils/src/rbtree.c
Normal file
629
utils/src/rbtree.c
Normal file
@@ -0,0 +1,629 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/*
|
||||
Red Black Trees
|
||||
(C) 1999 Andrea Arcangeli <andrea@suse.de>
|
||||
(C) 2002 David Woodhouse <dwmw2@infradead.org>
|
||||
(C) 2012 Michel Lespinasse <walken@google.com>
|
||||
|
||||
|
||||
linux/lib/rbtree.c
|
||||
*/
|
||||
|
||||
#include "lk_rbtree_wrapper.h"
|
||||
|
||||
/*
|
||||
* red-black trees properties: https://en.wikipedia.org/wiki/Rbtree
|
||||
*
|
||||
* 1) A node is either red or black
|
||||
* 2) The root is black
|
||||
* 3) All leaves (NULL) are black
|
||||
* 4) Both children of every red node are black
|
||||
* 5) Every simple path from root to leaves contains the same number
|
||||
* of black nodes.
|
||||
*
|
||||
* 4 and 5 give the O(log n) guarantee, since 4 implies you cannot have two
|
||||
* consecutive red nodes in a path and every red node is therefore followed by
|
||||
* a black. So if B is the number of black nodes on every simple path (as per
|
||||
* 5), then the longest possible path due to 4 is 2B.
|
||||
*
|
||||
* We shall indicate color with case, where black nodes are uppercase and red
|
||||
* nodes will be lowercase. Unknown color nodes shall be drawn as red within
|
||||
* parentheses and have some accompanying text comment.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Notes on lockless lookups:
|
||||
*
|
||||
* All stores to the tree structure (rb_left and rb_right) must be done using
|
||||
* WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
|
||||
* tree structure as seen in program order.
|
||||
*
|
||||
* These two requirements will allow lockless iteration of the tree -- not
|
||||
* correct iteration mind you, tree rotations are not atomic so a lookup might
|
||||
* miss entire subtrees.
|
||||
*
|
||||
* But they do guarantee that any such traversal will only see valid elements
|
||||
* and that it will indeed complete -- does not get stuck in a loop.
|
||||
*
|
||||
* It also guarantees that if the lookup returns an element it is the 'correct'
|
||||
* one. But not returning an element does _NOT_ mean it's not present.
|
||||
*
|
||||
* NOTE:
|
||||
*
|
||||
* Stores to __rb_parent_color are not important for simple lookups so those
|
||||
* are left undone as of now. Nor did I check for loops involving parent
|
||||
* pointers.
|
||||
*/
|
||||
|
||||
static inline void rb_set_black(struct rb_node *rb)
|
||||
{
|
||||
rb->__rb_parent_color |= RB_BLACK;
|
||||
}
|
||||
|
||||
static inline struct rb_node *rb_red_parent(struct rb_node *red)
|
||||
{
|
||||
return (struct rb_node *)red->__rb_parent_color;
|
||||
}
|
||||
|
||||
/*
|
||||
* Helper function for rotations:
|
||||
* - old's parent and color get assigned to new
|
||||
* - old gets assigned new as a parent and 'color' as a color.
|
||||
*/
|
||||
static inline void
|
||||
__rb_rotate_set_parents(struct rb_node *old, struct rb_node *new,
|
||||
struct rb_root *root, int color)
|
||||
{
|
||||
struct rb_node *parent = rb_parent(old);
|
||||
new->__rb_parent_color = old->__rb_parent_color;
|
||||
rb_set_parent_color(old, new, color);
|
||||
__rb_change_child(old, new, parent, root);
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
__rb_insert(struct rb_node *node, struct rb_root *root,
|
||||
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
|
||||
{
|
||||
struct rb_node *parent = rb_red_parent(node), *gparent, *tmp;
|
||||
|
||||
while (true) {
|
||||
/*
|
||||
* Loop invariant: node is red.
|
||||
*/
|
||||
if (unlikely(!parent)) {
|
||||
/*
|
||||
* The inserted node is root. Either this is the
|
||||
* first node, or we recursed at Case 1 below and
|
||||
* are no longer violating 4).
|
||||
*/
|
||||
rb_set_parent_color(node, NULL, RB_BLACK);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If there is a black parent, we are done.
|
||||
* Otherwise, take some corrective action as,
|
||||
* per 4), we don't want a red root or two
|
||||
* consecutive red nodes.
|
||||
*/
|
||||
if(rb_is_black(parent))
|
||||
break;
|
||||
|
||||
gparent = rb_red_parent(parent);
|
||||
|
||||
tmp = gparent->rb_right;
|
||||
if (parent != tmp) { /* parent == gparent->rb_left */
|
||||
if (tmp && rb_is_red(tmp)) {
|
||||
/*
|
||||
* Case 1 - node's uncle is red (color flips).
|
||||
*
|
||||
* G g
|
||||
* / \ / \
|
||||
* p u --> P U
|
||||
* / /
|
||||
* n n
|
||||
*
|
||||
* However, since g's parent might be red, and
|
||||
* 4) does not allow this, we need to recurse
|
||||
* at g.
|
||||
*/
|
||||
rb_set_parent_color(tmp, gparent, RB_BLACK);
|
||||
rb_set_parent_color(parent, gparent, RB_BLACK);
|
||||
node = gparent;
|
||||
parent = rb_parent(node);
|
||||
rb_set_parent_color(node, parent, RB_RED);
|
||||
continue;
|
||||
}
|
||||
|
||||
tmp = parent->rb_right;
|
||||
if (node == tmp) {
|
||||
/*
|
||||
* Case 2 - node's uncle is black and node is
|
||||
* the parent's right child (left rotate at parent).
|
||||
*
|
||||
* G G
|
||||
* / \ / \
|
||||
* p U --> n U
|
||||
* \ /
|
||||
* n p
|
||||
*
|
||||
* This still leaves us in violation of 4), the
|
||||
* continuation into Case 3 will fix that.
|
||||
*/
|
||||
tmp = node->rb_left;
|
||||
WRITE_ONCE(parent->rb_right, tmp);
|
||||
WRITE_ONCE(node->rb_left, parent);
|
||||
if (tmp)
|
||||
rb_set_parent_color(tmp, parent,
|
||||
RB_BLACK);
|
||||
rb_set_parent_color(parent, node, RB_RED);
|
||||
augment_rotate(parent, node);
|
||||
parent = node;
|
||||
tmp = node->rb_right;
|
||||
}
|
||||
|
||||
/*
|
||||
* Case 3 - node's uncle is black and node is
|
||||
* the parent's left child (right rotate at gparent).
|
||||
*
|
||||
* G P
|
||||
* / \ / \
|
||||
* p U --> n g
|
||||
* / \
|
||||
* n U
|
||||
*/
|
||||
WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
|
||||
WRITE_ONCE(parent->rb_right, gparent);
|
||||
if (tmp)
|
||||
rb_set_parent_color(tmp, gparent, RB_BLACK);
|
||||
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
|
||||
augment_rotate(gparent, parent);
|
||||
break;
|
||||
} else {
|
||||
tmp = gparent->rb_left;
|
||||
if (tmp && rb_is_red(tmp)) {
|
||||
/* Case 1 - color flips */
|
||||
rb_set_parent_color(tmp, gparent, RB_BLACK);
|
||||
rb_set_parent_color(parent, gparent, RB_BLACK);
|
||||
node = gparent;
|
||||
parent = rb_parent(node);
|
||||
rb_set_parent_color(node, parent, RB_RED);
|
||||
continue;
|
||||
}
|
||||
|
||||
tmp = parent->rb_left;
|
||||
if (node == tmp) {
|
||||
/* Case 2 - right rotate at parent */
|
||||
tmp = node->rb_right;
|
||||
WRITE_ONCE(parent->rb_left, tmp);
|
||||
WRITE_ONCE(node->rb_right, parent);
|
||||
if (tmp)
|
||||
rb_set_parent_color(tmp, parent,
|
||||
RB_BLACK);
|
||||
rb_set_parent_color(parent, node, RB_RED);
|
||||
augment_rotate(parent, node);
|
||||
parent = node;
|
||||
tmp = node->rb_left;
|
||||
}
|
||||
|
||||
/* Case 3 - left rotate at gparent */
|
||||
WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
|
||||
WRITE_ONCE(parent->rb_left, gparent);
|
||||
if (tmp)
|
||||
rb_set_parent_color(tmp, gparent, RB_BLACK);
|
||||
__rb_rotate_set_parents(gparent, parent, root, RB_RED);
|
||||
augment_rotate(gparent, parent);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Inline version for rb_erase() use - we want to be able to inline
|
||||
* and eliminate the dummy_rotate callback there
|
||||
*/
|
||||
static __always_inline void
|
||||
____rb_erase_color(struct rb_node *parent, struct rb_root *root,
|
||||
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
|
||||
{
|
||||
struct rb_node *node = NULL, *sibling, *tmp1, *tmp2;
|
||||
|
||||
while (true) {
|
||||
/*
|
||||
* Loop invariants:
|
||||
* - node is black (or NULL on first iteration)
|
||||
* - node is not the root (parent is not NULL)
|
||||
* - All leaf paths going through parent and node have a
|
||||
* black node count that is 1 lower than other leaf paths.
|
||||
*/
|
||||
sibling = parent->rb_right;
|
||||
if (node != sibling) { /* node == parent->rb_left */
|
||||
if (rb_is_red(sibling)) {
|
||||
/*
|
||||
* Case 1 - left rotate at parent
|
||||
*
|
||||
* P S
|
||||
* / \ / \
|
||||
* N s --> p Sr
|
||||
* / \ / \
|
||||
* Sl Sr N Sl
|
||||
*/
|
||||
tmp1 = sibling->rb_left;
|
||||
WRITE_ONCE(parent->rb_right, tmp1);
|
||||
WRITE_ONCE(sibling->rb_left, parent);
|
||||
rb_set_parent_color(tmp1, parent, RB_BLACK);
|
||||
__rb_rotate_set_parents(parent, sibling, root,
|
||||
RB_RED);
|
||||
augment_rotate(parent, sibling);
|
||||
sibling = tmp1;
|
||||
}
|
||||
tmp1 = sibling->rb_right;
|
||||
if (!tmp1 || rb_is_black(tmp1)) {
|
||||
tmp2 = sibling->rb_left;
|
||||
if (!tmp2 || rb_is_black(tmp2)) {
|
||||
/*
|
||||
* Case 2 - sibling color flip
|
||||
* (p could be either color here)
|
||||
*
|
||||
* (p) (p)
|
||||
* / \ / \
|
||||
* N S --> N s
|
||||
* / \ / \
|
||||
* Sl Sr Sl Sr
|
||||
*
|
||||
* This leaves us violating 5) which
|
||||
* can be fixed by flipping p to black
|
||||
* if it was red, or by recursing at p.
|
||||
* p is red when coming from Case 1.
|
||||
*/
|
||||
rb_set_parent_color(sibling, parent,
|
||||
RB_RED);
|
||||
if (rb_is_red(parent))
|
||||
rb_set_black(parent);
|
||||
else {
|
||||
node = parent;
|
||||
parent = rb_parent(node);
|
||||
if (parent)
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
/*
|
||||
* Case 3 - right rotate at sibling
|
||||
* (p could be either color here)
|
||||
*
|
||||
* (p) (p)
|
||||
* / \ / \
|
||||
* N S --> N sl
|
||||
* / \ \
|
||||
* sl Sr S
|
||||
* \
|
||||
* Sr
|
||||
*
|
||||
* Note: p might be red, and then both
|
||||
* p and sl are red after rotation(which
|
||||
* breaks property 4). This is fixed in
|
||||
* Case 4 (in __rb_rotate_set_parents()
|
||||
* which set sl the color of p
|
||||
* and set p RB_BLACK)
|
||||
*
|
||||
* (p) (sl)
|
||||
* / \ / \
|
||||
* N sl --> P S
|
||||
* \ / \
|
||||
* S N Sr
|
||||
* \
|
||||
* Sr
|
||||
*/
|
||||
tmp1 = tmp2->rb_right;
|
||||
WRITE_ONCE(sibling->rb_left, tmp1);
|
||||
WRITE_ONCE(tmp2->rb_right, sibling);
|
||||
WRITE_ONCE(parent->rb_right, tmp2);
|
||||
if (tmp1)
|
||||
rb_set_parent_color(tmp1, sibling,
|
||||
RB_BLACK);
|
||||
augment_rotate(sibling, tmp2);
|
||||
tmp1 = sibling;
|
||||
sibling = tmp2;
|
||||
}
|
||||
/*
|
||||
* Case 4 - left rotate at parent + color flips
|
||||
* (p and sl could be either color here.
|
||||
* After rotation, p becomes black, s acquires
|
||||
* p's color, and sl keeps its color)
|
||||
*
|
||||
* (p) (s)
|
||||
* / \ / \
|
||||
* N S --> P Sr
|
||||
* / \ / \
|
||||
* (sl) sr N (sl)
|
||||
*/
|
||||
tmp2 = sibling->rb_left;
|
||||
WRITE_ONCE(parent->rb_right, tmp2);
|
||||
WRITE_ONCE(sibling->rb_left, parent);
|
||||
rb_set_parent_color(tmp1, sibling, RB_BLACK);
|
||||
if (tmp2)
|
||||
rb_set_parent(tmp2, parent);
|
||||
__rb_rotate_set_parents(parent, sibling, root,
|
||||
RB_BLACK);
|
||||
augment_rotate(parent, sibling);
|
||||
break;
|
||||
} else {
|
||||
sibling = parent->rb_left;
|
||||
if (rb_is_red(sibling)) {
|
||||
/* Case 1 - right rotate at parent */
|
||||
tmp1 = sibling->rb_right;
|
||||
WRITE_ONCE(parent->rb_left, tmp1);
|
||||
WRITE_ONCE(sibling->rb_right, parent);
|
||||
rb_set_parent_color(tmp1, parent, RB_BLACK);
|
||||
__rb_rotate_set_parents(parent, sibling, root,
|
||||
RB_RED);
|
||||
augment_rotate(parent, sibling);
|
||||
sibling = tmp1;
|
||||
}
|
||||
tmp1 = sibling->rb_left;
|
||||
if (!tmp1 || rb_is_black(tmp1)) {
|
||||
tmp2 = sibling->rb_right;
|
||||
if (!tmp2 || rb_is_black(tmp2)) {
|
||||
/* Case 2 - sibling color flip */
|
||||
rb_set_parent_color(sibling, parent,
|
||||
RB_RED);
|
||||
if (rb_is_red(parent))
|
||||
rb_set_black(parent);
|
||||
else {
|
||||
node = parent;
|
||||
parent = rb_parent(node);
|
||||
if (parent)
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
/* Case 3 - left rotate at sibling */
|
||||
tmp1 = tmp2->rb_left;
|
||||
WRITE_ONCE(sibling->rb_right, tmp1);
|
||||
WRITE_ONCE(tmp2->rb_left, sibling);
|
||||
WRITE_ONCE(parent->rb_left, tmp2);
|
||||
if (tmp1)
|
||||
rb_set_parent_color(tmp1, sibling,
|
||||
RB_BLACK);
|
||||
augment_rotate(sibling, tmp2);
|
||||
tmp1 = sibling;
|
||||
sibling = tmp2;
|
||||
}
|
||||
/* Case 4 - right rotate at parent + color flips */
|
||||
tmp2 = sibling->rb_right;
|
||||
WRITE_ONCE(parent->rb_left, tmp2);
|
||||
WRITE_ONCE(sibling->rb_right, parent);
|
||||
rb_set_parent_color(tmp1, sibling, RB_BLACK);
|
||||
if (tmp2)
|
||||
rb_set_parent(tmp2, parent);
|
||||
__rb_rotate_set_parents(parent, sibling, root,
|
||||
RB_BLACK);
|
||||
augment_rotate(parent, sibling);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Non-inline version for rb_erase_augmented() use */
|
||||
void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
|
||||
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
|
||||
{
|
||||
____rb_erase_color(parent, root, augment_rotate);
|
||||
}
|
||||
EXPORT_SYMBOL(__rb_erase_color);
|
||||
|
||||
/*
|
||||
* Non-augmented rbtree manipulation functions.
|
||||
*
|
||||
* We use dummy augmented callbacks here, and have the compiler optimize them
|
||||
* out of the rb_insert_color() and rb_erase() function definitions.
|
||||
*/
|
||||
|
||||
static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {}
|
||||
static inline void dummy_copy(struct rb_node *old, struct rb_node *new) {}
|
||||
static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {}
|
||||
|
||||
static const struct rb_augment_callbacks dummy_callbacks = {
|
||||
.propagate = dummy_propagate,
|
||||
.copy = dummy_copy,
|
||||
.rotate = dummy_rotate
|
||||
};
|
||||
|
||||
void rb_insert_color(struct rb_node *node, struct rb_root *root)
|
||||
{
|
||||
__rb_insert(node, root, dummy_rotate);
|
||||
}
|
||||
EXPORT_SYMBOL(rb_insert_color);
|
||||
|
||||
void rb_erase(struct rb_node *node, struct rb_root *root)
|
||||
{
|
||||
struct rb_node *rebalance;
|
||||
rebalance = __rb_erase_augmented(node, root, &dummy_callbacks);
|
||||
if (rebalance)
|
||||
____rb_erase_color(rebalance, root, dummy_rotate);
|
||||
}
|
||||
EXPORT_SYMBOL(rb_erase);
|
||||
|
||||
/*
|
||||
* Augmented rbtree manipulation functions.
|
||||
*
|
||||
* This instantiates the same __always_inline functions as in the non-augmented
|
||||
* case, but this time with user-defined callbacks.
|
||||
*/
|
||||
|
||||
void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
|
||||
void (*augment_rotate)(struct rb_node *old, struct rb_node *new))
|
||||
{
|
||||
__rb_insert(node, root, augment_rotate);
|
||||
}
|
||||
EXPORT_SYMBOL(__rb_insert_augmented);
|
||||
|
||||
/*
|
||||
* This function returns the first node (in sort order) of the tree.
|
||||
*/
|
||||
struct rb_node *rb_first(const struct rb_root *root)
|
||||
{
|
||||
struct rb_node *n;
|
||||
|
||||
n = root->rb_node;
|
||||
if (!n)
|
||||
return NULL;
|
||||
while (n->rb_left)
|
||||
n = n->rb_left;
|
||||
return n;
|
||||
}
|
||||
EXPORT_SYMBOL(rb_first);
|
||||
|
||||
struct rb_node *rb_last(const struct rb_root *root)
|
||||
{
|
||||
struct rb_node *n;
|
||||
|
||||
n = root->rb_node;
|
||||
if (!n)
|
||||
return NULL;
|
||||
while (n->rb_right)
|
||||
n = n->rb_right;
|
||||
return n;
|
||||
}
|
||||
EXPORT_SYMBOL(rb_last);
|
||||
|
||||
struct rb_node *rb_next(const struct rb_node *node)
|
||||
{
|
||||
struct rb_node *parent;
|
||||
|
||||
if (RB_EMPTY_NODE(node))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* If we have a right-hand child, go down and then left as far
|
||||
* as we can.
|
||||
*/
|
||||
if (node->rb_right) {
|
||||
node = node->rb_right;
|
||||
while (node->rb_left)
|
||||
node = node->rb_left;
|
||||
return (struct rb_node *)node;
|
||||
}
|
||||
|
||||
/*
|
||||
* No right-hand children. Everything down and left is smaller than us,
|
||||
* so any 'next' node must be in the general direction of our parent.
|
||||
* Go up the tree; any time the ancestor is a right-hand child of its
|
||||
* parent, keep going up. First time it's a left-hand child of its
|
||||
* parent, said parent is our 'next' node.
|
||||
*/
|
||||
while ((parent = rb_parent(node)) && node == parent->rb_right)
|
||||
node = parent;
|
||||
|
||||
return parent;
|
||||
}
|
||||
EXPORT_SYMBOL(rb_next);
|
||||
|
||||
struct rb_node *rb_prev(const struct rb_node *node)
|
||||
{
|
||||
struct rb_node *parent;
|
||||
|
||||
if (RB_EMPTY_NODE(node))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* If we have a left-hand child, go down and then right as far
|
||||
* as we can.
|
||||
*/
|
||||
if (node->rb_left) {
|
||||
node = node->rb_left;
|
||||
while (node->rb_right)
|
||||
node = node->rb_right;
|
||||
return (struct rb_node *)node;
|
||||
}
|
||||
|
||||
/*
|
||||
* No left-hand children. Go up till we find an ancestor which
|
||||
* is a right-hand child of its parent.
|
||||
*/
|
||||
while ((parent = rb_parent(node)) && node == parent->rb_left)
|
||||
node = parent;
|
||||
|
||||
return parent;
|
||||
}
|
||||
EXPORT_SYMBOL(rb_prev);
|
||||
|
||||
void rb_replace_node(struct rb_node *victim, struct rb_node *new,
|
||||
struct rb_root *root)
|
||||
{
|
||||
struct rb_node *parent = rb_parent(victim);
|
||||
|
||||
/* Copy the pointers/colour from the victim to the replacement */
|
||||
*new = *victim;
|
||||
|
||||
/* Set the surrounding nodes to point to the replacement */
|
||||
if (victim->rb_left)
|
||||
rb_set_parent(victim->rb_left, new);
|
||||
if (victim->rb_right)
|
||||
rb_set_parent(victim->rb_right, new);
|
||||
__rb_change_child(victim, new, parent, root);
|
||||
}
|
||||
EXPORT_SYMBOL(rb_replace_node);
|
||||
|
||||
void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
|
||||
struct rb_root *root)
|
||||
{
|
||||
struct rb_node *parent = rb_parent(victim);
|
||||
|
||||
/* Copy the pointers/colour from the victim to the replacement */
|
||||
*new = *victim;
|
||||
|
||||
/* Set the surrounding nodes to point to the replacement */
|
||||
if (victim->rb_left)
|
||||
rb_set_parent(victim->rb_left, new);
|
||||
if (victim->rb_right)
|
||||
rb_set_parent(victim->rb_right, new);
|
||||
|
||||
/* Set the parent's pointer to the new node last after an RCU barrier
|
||||
* so that the pointers onwards are seen to be set correctly when doing
|
||||
* an RCU walk over the tree.
|
||||
*/
|
||||
__rb_change_child_rcu(victim, new, parent, root);
|
||||
}
|
||||
EXPORT_SYMBOL(rb_replace_node_rcu);
|
||||
|
||||
static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
|
||||
{
|
||||
for (;;) {
|
||||
if (node->rb_left)
|
||||
node = node->rb_left;
|
||||
else if (node->rb_right)
|
||||
node = node->rb_right;
|
||||
else
|
||||
return (struct rb_node *)node;
|
||||
}
|
||||
}
|
||||
|
||||
struct rb_node *rb_next_postorder(const struct rb_node *node)
|
||||
{
|
||||
const struct rb_node *parent;
|
||||
if (!node)
|
||||
return NULL;
|
||||
parent = rb_parent(node);
|
||||
|
||||
/* If we're sitting on node, we've already seen our children */
|
||||
if (parent && node == parent->rb_left && parent->rb_right) {
|
||||
/* If we are the parent's left node, go to the parent's right
|
||||
* node then all the way down to the left */
|
||||
return rb_left_deepest_node(parent->rb_right);
|
||||
} else
|
||||
/* Otherwise we are the parent's right node, and the parent
|
||||
* should be next */
|
||||
return (struct rb_node *)parent;
|
||||
}
|
||||
EXPORT_SYMBOL(rb_next_postorder);
|
||||
|
||||
struct rb_node *rb_first_postorder(const struct rb_root *root)
|
||||
{
|
||||
if (!root->rb_node)
|
||||
return NULL;
|
||||
|
||||
return rb_left_deepest_node(root->rb_node);
|
||||
}
|
||||
EXPORT_SYMBOL(rb_first_postorder);
|
||||
328
utils/src/rbtree.h
Normal file
328
utils/src/rbtree.h
Normal file
@@ -0,0 +1,328 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
/*
|
||||
Red Black Trees
|
||||
(C) 1999 Andrea Arcangeli <andrea@suse.de>
|
||||
|
||||
|
||||
linux/include/linux/rbtree.h
|
||||
|
||||
To use rbtrees you'll have to implement your own insert and search cores.
|
||||
This will avoid us to use callbacks and to drop drammatically performances.
|
||||
I know it's not the cleaner way, but in C (not in C++) to get
|
||||
performances and genericity...
|
||||
|
||||
See Documentation/core-api/rbtree.rst for documentation and samples.
|
||||
*/
|
||||
|
||||
#ifndef _LINUX_RBTREE_H
|
||||
#define _LINUX_RBTREE_H
|
||||
|
||||
#define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3))
|
||||
|
||||
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
|
||||
|
||||
#define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL)
|
||||
|
||||
/* 'empty' nodes are nodes that are known not to be inserted in an rbtree */
|
||||
#define RB_EMPTY_NODE(node) \
|
||||
((node)->__rb_parent_color == (unsigned long)(node))
|
||||
#define RB_CLEAR_NODE(node) \
|
||||
((node)->__rb_parent_color = (unsigned long)(node))
|
||||
|
||||
|
||||
extern void rb_insert_color(struct rb_node *, struct rb_root *);
|
||||
extern void rb_erase(struct rb_node *, struct rb_root *);
|
||||
|
||||
|
||||
/* Find logical next and previous nodes in a tree */
|
||||
extern struct rb_node *rb_next(const struct rb_node *);
|
||||
extern struct rb_node *rb_prev(const struct rb_node *);
|
||||
extern struct rb_node *rb_first(const struct rb_root *);
|
||||
extern struct rb_node *rb_last(const struct rb_root *);
|
||||
|
||||
/* Postorder iteration - always visit the parent after its children */
|
||||
extern struct rb_node *rb_first_postorder(const struct rb_root *);
|
||||
extern struct rb_node *rb_next_postorder(const struct rb_node *);
|
||||
|
||||
/* Fast replacement of a single node without remove/rebalance/add/rebalance */
|
||||
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
|
||||
struct rb_root *root);
|
||||
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
|
||||
struct rb_root *root);
|
||||
|
||||
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
|
||||
struct rb_node **rb_link)
|
||||
{
|
||||
node->__rb_parent_color = (unsigned long)parent;
|
||||
node->rb_left = node->rb_right = NULL;
|
||||
|
||||
*rb_link = node;
|
||||
}
|
||||
|
||||
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
|
||||
struct rb_node **rb_link)
|
||||
{
|
||||
node->__rb_parent_color = (unsigned long)parent;
|
||||
node->rb_left = node->rb_right = NULL;
|
||||
|
||||
rcu_assign_pointer(*rb_link, node);
|
||||
}
|
||||
|
||||
#define rb_entry_safe(ptr, type, member) \
|
||||
({ typeof(ptr) ____ptr = (ptr); \
|
||||
____ptr ? rb_entry(____ptr, type, member) : NULL; \
|
||||
})
|
||||
|
||||
/**
|
||||
* rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
|
||||
* given type allowing the backing memory of @pos to be invalidated
|
||||
*
|
||||
* @pos: the 'type *' to use as a loop cursor.
|
||||
* @n: another 'type *' to use as temporary storage
|
||||
* @root: 'rb_root *' of the rbtree.
|
||||
* @field: the name of the rb_node field within 'type'.
|
||||
*
|
||||
* rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
|
||||
* list_for_each_entry_safe() and allows the iteration to continue independent
|
||||
* of changes to @pos by the body of the loop.
|
||||
*
|
||||
* Note, however, that it cannot handle other modifications that re-order the
|
||||
* rbtree it is iterating over. This includes calling rb_erase() on @pos, as
|
||||
* rb_erase() may rebalance the tree, causing us to miss some nodes.
|
||||
*/
|
||||
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
|
||||
for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
|
||||
pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
|
||||
typeof(*pos), field); 1; }); \
|
||||
pos = n)
|
||||
|
||||
/* Same as rb_first(), but O(1) */
|
||||
#define rb_first_cached(root) (root)->rb_leftmost
|
||||
|
||||
static inline void rb_insert_color_cached(struct rb_node *node,
|
||||
struct rb_root_cached *root,
|
||||
bool leftmost)
|
||||
{
|
||||
if (leftmost)
|
||||
root->rb_leftmost = node;
|
||||
rb_insert_color(node, &root->rb_root);
|
||||
}
|
||||
|
||||
|
||||
static inline struct rb_node *
|
||||
rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
|
||||
{
|
||||
struct rb_node *leftmost = NULL;
|
||||
|
||||
if (root->rb_leftmost == node)
|
||||
leftmost = root->rb_leftmost = rb_next(node);
|
||||
|
||||
rb_erase(node, &root->rb_root);
|
||||
|
||||
return leftmost;
|
||||
}
|
||||
|
||||
static inline void rb_replace_node_cached(struct rb_node *victim,
|
||||
struct rb_node *new,
|
||||
struct rb_root_cached *root)
|
||||
{
|
||||
if (root->rb_leftmost == victim)
|
||||
root->rb_leftmost = new;
|
||||
rb_replace_node(victim, new, &root->rb_root);
|
||||
}
|
||||
|
||||
/*
|
||||
* The below helper functions use 2 operators with 3 different
|
||||
* calling conventions. The operators are related like:
|
||||
*
|
||||
* comp(a->key,b) < 0 := less(a,b)
|
||||
* comp(a->key,b) > 0 := less(b,a)
|
||||
* comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
|
||||
*
|
||||
* If these operators define a partial order on the elements we make no
|
||||
* guarantee on which of the elements matching the key is found. See
|
||||
* rb_find().
|
||||
*
|
||||
* The reason for this is to allow the find() interface without requiring an
|
||||
* on-stack dummy object, which might not be feasible due to object size.
|
||||
*/
|
||||
|
||||
/**
|
||||
* rb_add_cached() - insert @node into the leftmost cached tree @tree
|
||||
* @node: node to insert
|
||||
* @tree: leftmost cached tree to insert @node into
|
||||
* @less: operator defining the (partial) node order
|
||||
*
|
||||
* Returns @node when it is the new leftmost, or NULL.
|
||||
*/
|
||||
static __always_inline struct rb_node *
|
||||
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
|
||||
bool (*less)(struct rb_node *, const struct rb_node *))
|
||||
{
|
||||
struct rb_node **link = &tree->rb_root.rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
bool leftmost = true;
|
||||
|
||||
while (*link) {
|
||||
parent = *link;
|
||||
if (less(node, parent)) {
|
||||
link = &parent->rb_left;
|
||||
} else {
|
||||
link = &parent->rb_right;
|
||||
leftmost = false;
|
||||
}
|
||||
}
|
||||
|
||||
rb_link_node(node, parent, link);
|
||||
rb_insert_color_cached(node, tree, leftmost);
|
||||
|
||||
return leftmost ? node : NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* rb_add() - insert @node into @tree
|
||||
* @node: node to insert
|
||||
* @tree: tree to insert @node into
|
||||
* @less: operator defining the (partial) node order
|
||||
*/
|
||||
static __always_inline void
|
||||
rb_add(struct rb_node *node, struct rb_root *tree,
|
||||
bool (*less)(struct rb_node *, const struct rb_node *))
|
||||
{
|
||||
struct rb_node **link = &tree->rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
|
||||
while (*link) {
|
||||
parent = *link;
|
||||
if (less(node, parent))
|
||||
link = &parent->rb_left;
|
||||
else
|
||||
link = &parent->rb_right;
|
||||
}
|
||||
|
||||
rb_link_node(node, parent, link);
|
||||
rb_insert_color(node, tree);
|
||||
}
|
||||
|
||||
/**
|
||||
* rb_find_add() - find equivalent @node in @tree, or add @node
|
||||
* @node: node to look-for / insert
|
||||
* @tree: tree to search / modify
|
||||
* @cmp: operator defining the node order
|
||||
*
|
||||
* Returns the rb_node matching @node, or NULL when no match is found and @node
|
||||
* is inserted.
|
||||
*/
|
||||
static __always_inline struct rb_node *
|
||||
rb_find_add(struct rb_node *node, struct rb_root *tree,
|
||||
int (*cmp)(struct rb_node *, const struct rb_node *))
|
||||
{
|
||||
struct rb_node **link = &tree->rb_node;
|
||||
struct rb_node *parent = NULL;
|
||||
int c;
|
||||
|
||||
while (*link) {
|
||||
parent = *link;
|
||||
c = cmp(node, parent);
|
||||
|
||||
if (c < 0)
|
||||
link = &parent->rb_left;
|
||||
else if (c > 0)
|
||||
link = &parent->rb_right;
|
||||
else
|
||||
return parent;
|
||||
}
|
||||
|
||||
rb_link_node(node, parent, link);
|
||||
rb_insert_color(node, tree);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* rb_find() - find @key in tree @tree
|
||||
* @key: key to match
|
||||
* @tree: tree to search
|
||||
* @cmp: operator defining the node order
|
||||
*
|
||||
* Returns the rb_node matching @key or NULL.
|
||||
*/
|
||||
static __always_inline struct rb_node *
|
||||
rb_find(const void *key, const struct rb_root *tree,
|
||||
int (*cmp)(const void *key, const struct rb_node *))
|
||||
{
|
||||
struct rb_node *node = tree->rb_node;
|
||||
|
||||
while (node) {
|
||||
int c = cmp(key, node);
|
||||
|
||||
if (c < 0)
|
||||
node = node->rb_left;
|
||||
else if (c > 0)
|
||||
node = node->rb_right;
|
||||
else
|
||||
return node;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* rb_find_first() - find the first @key in @tree
|
||||
* @key: key to match
|
||||
* @tree: tree to search
|
||||
* @cmp: operator defining node order
|
||||
*
|
||||
* Returns the leftmost node matching @key, or NULL.
|
||||
*/
|
||||
static __always_inline struct rb_node *
|
||||
rb_find_first(const void *key, const struct rb_root *tree,
|
||||
int (*cmp)(const void *key, const struct rb_node *))
|
||||
{
|
||||
struct rb_node *node = tree->rb_node;
|
||||
struct rb_node *match = NULL;
|
||||
|
||||
while (node) {
|
||||
int c = cmp(key, node);
|
||||
|
||||
if (c <= 0) {
|
||||
if (!c)
|
||||
match = node;
|
||||
node = node->rb_left;
|
||||
} else if (c > 0) {
|
||||
node = node->rb_right;
|
||||
}
|
||||
}
|
||||
|
||||
return match;
|
||||
}
|
||||
|
||||
/**
|
||||
* rb_next_match() - find the next @key in @tree
|
||||
* @key: key to match
|
||||
* @tree: tree to search
|
||||
* @cmp: operator defining node order
|
||||
*
|
||||
* Returns the next node matching @key, or NULL.
|
||||
*/
|
||||
static __always_inline struct rb_node *
|
||||
rb_next_match(const void *key, struct rb_node *node,
|
||||
int (*cmp)(const void *key, const struct rb_node *))
|
||||
{
|
||||
node = rb_next(node);
|
||||
if (node && cmp(key, node))
|
||||
node = NULL;
|
||||
return node;
|
||||
}
|
||||
|
||||
/**
|
||||
* rb_for_each() - iterates a subtree matching @key
|
||||
* @node: iterator
|
||||
* @key: key to match
|
||||
* @tree: tree to search
|
||||
* @cmp: operator defining node order
|
||||
*/
|
||||
#define rb_for_each(node, key, tree, cmp) \
|
||||
for ((node) = rb_find_first((key), (tree), (cmp)); \
|
||||
(node); (node) = rb_next_match((key), (node), (cmp)))
|
||||
|
||||
#endif /* _LINUX_RBTREE_H */
|
||||
313
utils/src/rbtree_augmented.h
Normal file
313
utils/src/rbtree_augmented.h
Normal file
@@ -0,0 +1,313 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
/*
|
||||
Red Black Trees
|
||||
(C) 1999 Andrea Arcangeli <andrea@suse.de>
|
||||
(C) 2002 David Woodhouse <dwmw2@infradead.org>
|
||||
(C) 2012 Michel Lespinasse <walken@google.com>
|
||||
|
||||
|
||||
linux/include/linux/rbtree_augmented.h
|
||||
*/
|
||||
|
||||
#ifndef _LINUX_RBTREE_AUGMENTED_H
|
||||
#define _LINUX_RBTREE_AUGMENTED_H
|
||||
|
||||
/*
|
||||
* Please note - only struct rb_augment_callbacks and the prototypes for
|
||||
* rb_insert_augmented() and rb_erase_augmented() are intended to be public.
|
||||
* The rest are implementation details you are not expected to depend on.
|
||||
*
|
||||
* See Documentation/core-api/rbtree.rst for documentation and samples.
|
||||
*/
|
||||
|
||||
struct rb_augment_callbacks {
|
||||
void (*propagate)(struct rb_node *node, struct rb_node *stop);
|
||||
void (*copy)(struct rb_node *old, struct rb_node *new);
|
||||
void (*rotate)(struct rb_node *old, struct rb_node *new);
|
||||
};
|
||||
|
||||
extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
|
||||
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
|
||||
|
||||
/*
|
||||
* Fixup the rbtree and update the augmented information when rebalancing.
|
||||
*
|
||||
* On insertion, the user must update the augmented information on the path
|
||||
* leading to the inserted node, then call rb_link_node() as usual and
|
||||
* rb_insert_augmented() instead of the usual rb_insert_color() call.
|
||||
* If rb_insert_augmented() rebalances the rbtree, it will callback into
|
||||
* a user provided function to update the augmented information on the
|
||||
* affected subtrees.
|
||||
*/
|
||||
static inline void
|
||||
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
|
||||
const struct rb_augment_callbacks *augment)
|
||||
{
|
||||
__rb_insert_augmented(node, root, augment->rotate);
|
||||
}
|
||||
|
||||
static inline void
|
||||
rb_insert_augmented_cached(struct rb_node *node,
|
||||
struct rb_root_cached *root, bool newleft,
|
||||
const struct rb_augment_callbacks *augment)
|
||||
{
|
||||
if (newleft)
|
||||
root->rb_leftmost = node;
|
||||
rb_insert_augmented(node, &root->rb_root, augment);
|
||||
}
|
||||
|
||||
/*
|
||||
* Template for declaring augmented rbtree callbacks (generic case)
|
||||
*
|
||||
* RBSTATIC: 'static' or empty
|
||||
* RBNAME: name of the rb_augment_callbacks structure
|
||||
* RBSTRUCT: struct type of the tree nodes
|
||||
* RBFIELD: name of struct rb_node field within RBSTRUCT
|
||||
* RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
|
||||
* RBCOMPUTE: name of function that recomputes the RBAUGMENTED data
|
||||
*/
|
||||
|
||||
#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
|
||||
RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \
|
||||
static inline void \
|
||||
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \
|
||||
{ \
|
||||
while (rb != stop) { \
|
||||
RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \
|
||||
if (RBCOMPUTE(node, true)) \
|
||||
break; \
|
||||
rb = rb_parent(&node->RBFIELD); \
|
||||
} \
|
||||
} \
|
||||
static inline void \
|
||||
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \
|
||||
{ \
|
||||
RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
|
||||
RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
|
||||
new->RBAUGMENTED = old->RBAUGMENTED; \
|
||||
} \
|
||||
static void \
|
||||
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \
|
||||
{ \
|
||||
RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \
|
||||
RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \
|
||||
new->RBAUGMENTED = old->RBAUGMENTED; \
|
||||
RBCOMPUTE(old, false); \
|
||||
} \
|
||||
RBSTATIC const struct rb_augment_callbacks RBNAME = { \
|
||||
.propagate = RBNAME ## _propagate, \
|
||||
.copy = RBNAME ## _copy, \
|
||||
.rotate = RBNAME ## _rotate \
|
||||
};
|
||||
|
||||
/*
|
||||
* Template for declaring augmented rbtree callbacks,
|
||||
* computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
|
||||
*
|
||||
* RBSTATIC: 'static' or empty
|
||||
* RBNAME: name of the rb_augment_callbacks structure
|
||||
* RBSTRUCT: struct type of the tree nodes
|
||||
* RBFIELD: name of struct rb_node field within RBSTRUCT
|
||||
* RBTYPE: type of the RBAUGMENTED field
|
||||
* RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
|
||||
* RBCOMPUTE: name of function that returns the per-node RBTYPE scalar
|
||||
*/
|
||||
|
||||
#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \
|
||||
RBTYPE, RBAUGMENTED, RBCOMPUTE) \
|
||||
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \
|
||||
{ \
|
||||
RBSTRUCT *child; \
|
||||
RBTYPE max = RBCOMPUTE(node); \
|
||||
if (node->RBFIELD.rb_left) { \
|
||||
child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \
|
||||
if (child->RBAUGMENTED > max) \
|
||||
max = child->RBAUGMENTED; \
|
||||
} \
|
||||
if (node->RBFIELD.rb_right) { \
|
||||
child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \
|
||||
if (child->RBAUGMENTED > max) \
|
||||
max = child->RBAUGMENTED; \
|
||||
} \
|
||||
if (exit && node->RBAUGMENTED == max) \
|
||||
return true; \
|
||||
node->RBAUGMENTED = max; \
|
||||
return false; \
|
||||
} \
|
||||
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \
|
||||
RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)
|
||||
|
||||
|
||||
#define RB_RED 0
|
||||
#define RB_BLACK 1
|
||||
|
||||
#define __rb_parent(pc) ((struct rb_node *)(pc & ~3))
|
||||
|
||||
#define __rb_color(pc) ((pc) & 1)
|
||||
#define __rb_is_black(pc) __rb_color(pc)
|
||||
#define __rb_is_red(pc) (!__rb_color(pc))
|
||||
#define rb_color(rb) __rb_color((rb)->__rb_parent_color)
|
||||
#define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color)
|
||||
#define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color)
|
||||
|
||||
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
|
||||
{
|
||||
rb->__rb_parent_color = rb_color(rb) | (unsigned long)p;
|
||||
}
|
||||
|
||||
static inline void rb_set_parent_color(struct rb_node *rb,
|
||||
struct rb_node *p, int color)
|
||||
{
|
||||
rb->__rb_parent_color = (unsigned long)p | color;
|
||||
}
|
||||
|
||||
static inline void
|
||||
__rb_change_child(struct rb_node *old, struct rb_node *new,
|
||||
struct rb_node *parent, struct rb_root *root)
|
||||
{
|
||||
if (parent) {
|
||||
if (parent->rb_left == old)
|
||||
WRITE_ONCE(parent->rb_left, new);
|
||||
else
|
||||
WRITE_ONCE(parent->rb_right, new);
|
||||
} else
|
||||
WRITE_ONCE(root->rb_node, new);
|
||||
}
|
||||
|
||||
static inline void
|
||||
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
|
||||
struct rb_node *parent, struct rb_root *root)
|
||||
{
|
||||
if (parent) {
|
||||
if (parent->rb_left == old)
|
||||
rcu_assign_pointer(parent->rb_left, new);
|
||||
else
|
||||
rcu_assign_pointer(parent->rb_right, new);
|
||||
} else
|
||||
rcu_assign_pointer(root->rb_node, new);
|
||||
}
|
||||
|
||||
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
|
||||
void (*augment_rotate)(struct rb_node *old, struct rb_node *new));
|
||||
|
||||
static __always_inline struct rb_node *
|
||||
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
|
||||
const struct rb_augment_callbacks *augment)
|
||||
{
|
||||
struct rb_node *child = node->rb_right;
|
||||
struct rb_node *tmp = node->rb_left;
|
||||
struct rb_node *parent, *rebalance;
|
||||
unsigned long pc;
|
||||
|
||||
if (!tmp) {
|
||||
/*
|
||||
* Case 1: node to erase has no more than 1 child (easy!)
|
||||
*
|
||||
* Note that if there is one child it must be red due to 5)
|
||||
* and node must be black due to 4). We adjust colors locally
|
||||
* so as to bypass __rb_erase_color() later on.
|
||||
*/
|
||||
pc = node->__rb_parent_color;
|
||||
parent = __rb_parent(pc);
|
||||
__rb_change_child(node, child, parent, root);
|
||||
if (child) {
|
||||
child->__rb_parent_color = pc;
|
||||
rebalance = NULL;
|
||||
} else
|
||||
rebalance = __rb_is_black(pc) ? parent : NULL;
|
||||
tmp = parent;
|
||||
} else if (!child) {
|
||||
/* Still case 1, but this time the child is node->rb_left */
|
||||
tmp->__rb_parent_color = pc = node->__rb_parent_color;
|
||||
parent = __rb_parent(pc);
|
||||
__rb_change_child(node, tmp, parent, root);
|
||||
rebalance = NULL;
|
||||
tmp = parent;
|
||||
} else {
|
||||
struct rb_node *successor = child, *child2;
|
||||
|
||||
tmp = child->rb_left;
|
||||
if (!tmp) {
|
||||
/*
|
||||
* Case 2: node's successor is its right child
|
||||
*
|
||||
* (n) (s)
|
||||
* / \ / \
|
||||
* (x) (s) -> (x) (c)
|
||||
* \
|
||||
* (c)
|
||||
*/
|
||||
parent = successor;
|
||||
child2 = successor->rb_right;
|
||||
|
||||
augment->copy(node, successor);
|
||||
} else {
|
||||
/*
|
||||
* Case 3: node's successor is leftmost under
|
||||
* node's right child subtree
|
||||
*
|
||||
* (n) (s)
|
||||
* / \ / \
|
||||
* (x) (y) -> (x) (y)
|
||||
* / /
|
||||
* (p) (p)
|
||||
* / /
|
||||
* (s) (c)
|
||||
* \
|
||||
* (c)
|
||||
*/
|
||||
do {
|
||||
parent = successor;
|
||||
successor = tmp;
|
||||
tmp = tmp->rb_left;
|
||||
} while (tmp);
|
||||
child2 = successor->rb_right;
|
||||
WRITE_ONCE(parent->rb_left, child2);
|
||||
WRITE_ONCE(successor->rb_right, child);
|
||||
rb_set_parent(child, successor);
|
||||
|
||||
augment->copy(node, successor);
|
||||
augment->propagate(parent, successor);
|
||||
}
|
||||
|
||||
tmp = node->rb_left;
|
||||
WRITE_ONCE(successor->rb_left, tmp);
|
||||
rb_set_parent(tmp, successor);
|
||||
|
||||
pc = node->__rb_parent_color;
|
||||
tmp = __rb_parent(pc);
|
||||
__rb_change_child(node, successor, tmp, root);
|
||||
|
||||
if (child2) {
|
||||
rb_set_parent_color(child2, parent, RB_BLACK);
|
||||
rebalance = NULL;
|
||||
} else {
|
||||
rebalance = rb_is_black(successor) ? parent : NULL;
|
||||
}
|
||||
successor->__rb_parent_color = pc;
|
||||
tmp = successor;
|
||||
}
|
||||
|
||||
augment->propagate(tmp, NULL);
|
||||
return rebalance;
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
|
||||
const struct rb_augment_callbacks *augment)
|
||||
{
|
||||
struct rb_node *rebalance = __rb_erase_augmented(node, root, augment);
|
||||
if (rebalance)
|
||||
__rb_erase_color(rebalance, root, augment->rotate);
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
|
||||
const struct rb_augment_callbacks *augment)
|
||||
{
|
||||
if (root->rb_leftmost == node)
|
||||
root->rb_leftmost = rb_next(node);
|
||||
rb_erase_augmented(node, &root->rb_root, augment);
|
||||
}
|
||||
|
||||
#endif /* _LINUX_RBTREE_AUGMENTED_H */
|
||||
34
utils/src/rbtree_types.h
Normal file
34
utils/src/rbtree_types.h
Normal file
@@ -0,0 +1,34 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#ifndef _LINUX_RBTREE_TYPES_H
|
||||
#define _LINUX_RBTREE_TYPES_H
|
||||
|
||||
struct rb_node {
|
||||
unsigned long __rb_parent_color;
|
||||
struct rb_node *rb_right;
|
||||
struct rb_node *rb_left;
|
||||
} __attribute__((aligned(sizeof(long))));
|
||||
/* The alignment might seem pointless, but allegedly CRIS needs it */
|
||||
|
||||
struct rb_root {
|
||||
struct rb_node *rb_node;
|
||||
};
|
||||
|
||||
/*
|
||||
* Leftmost-cached rbtrees.
|
||||
*
|
||||
* We do not cache the rightmost node based on footprint
|
||||
* size vs number of potential users that could benefit
|
||||
* from O(1) rb_last(). Just not worth it, users that want
|
||||
* this feature can always implement the logic explicitly.
|
||||
* Furthermore, users that want to cache both pointers may
|
||||
* find it a bit asymmetric, but that's ok.
|
||||
*/
|
||||
struct rb_root_cached {
|
||||
struct rb_root rb_root;
|
||||
struct rb_node *rb_leftmost;
|
||||
};
|
||||
|
||||
#define RB_ROOT (struct rb_root) { NULL, }
|
||||
#define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL }
|
||||
|
||||
#endif
|
||||
@@ -44,3 +44,37 @@ int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,
|
||||
|
||||
return tot;
|
||||
}
|
||||
|
||||
static int encode_u64(__le64 *buf, u64 val)
|
||||
{
|
||||
int bytes;
|
||||
|
||||
val = (val << 1) ^ ((s64)val >> 63); /* shift sign extend */
|
||||
bytes = (fls64(val) + 7) >> 3;
|
||||
|
||||
put_unaligned_le64(val, buf);
|
||||
return bytes;
|
||||
}
|
||||
|
||||
int srch_encode_entry(void *buf, struct scoutfs_srch_entry *sre, struct scoutfs_srch_entry *prev)
|
||||
{
|
||||
u64 diffs[] = {
|
||||
le64_to_cpu(sre->hash) - le64_to_cpu(prev->hash),
|
||||
le64_to_cpu(sre->ino) - le64_to_cpu(prev->ino),
|
||||
le64_to_cpu(sre->id) - le64_to_cpu(prev->id),
|
||||
};
|
||||
u16 lengths = 0;
|
||||
int bytes;
|
||||
int tot = 2;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < array_size(diffs); i++) {
|
||||
bytes = encode_u64(buf + tot, diffs[i]);
|
||||
lengths |= bytes << (i << 2);
|
||||
tot += bytes;
|
||||
}
|
||||
|
||||
put_unaligned_le16(lengths, buf);
|
||||
|
||||
return tot;
|
||||
}
|
||||
|
||||
@@ -3,5 +3,6 @@
|
||||
|
||||
int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,
|
||||
struct scoutfs_srch_entry *prev);
|
||||
int srch_encode_entry(void *buf, struct scoutfs_srch_entry *sre, struct scoutfs_srch_entry *prev);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -70,6 +70,8 @@ do { \
|
||||
#define container_of(ptr, type, memb) \
|
||||
((type *)((void *)(ptr) - offsetof(type, memb)))
|
||||
|
||||
#define NSEC_PER_SEC 1000000000
|
||||
|
||||
#define BITS_PER_LONG (sizeof(long) * 8)
|
||||
#define U8_MAX ((u8)~0ULL)
|
||||
#define U16_MAX ((u16)~0ULL)
|
||||
@@ -82,6 +84,7 @@ do { \
|
||||
\
|
||||
(_x == 0 ? 0 : 64 - __builtin_clzll(_x)); \
|
||||
})
|
||||
#define fls64(x) flsll(x)
|
||||
|
||||
#define ilog2(x) \
|
||||
({ \
|
||||
@@ -99,6 +102,16 @@ emit_get_unaligned_le(16)
|
||||
emit_get_unaligned_le(32)
|
||||
emit_get_unaligned_le(64)
|
||||
|
||||
#define emit_put_unaligned_le(nr) \
|
||||
static inline void put_unaligned_le##nr(u##nr val, void *buf) \
|
||||
{ \
|
||||
__le##nr x = cpu_to_le##nr(val); \
|
||||
memcpy(buf, &x, sizeof(x)); \
|
||||
}
|
||||
emit_put_unaligned_le(16)
|
||||
emit_put_unaligned_le(32)
|
||||
emit_put_unaligned_le(64)
|
||||
|
||||
/*
|
||||
* return -1,0,+1 based on the memcmp comparison of the minimum of their
|
||||
* two lengths. If their min shared bytes are equal but the lengths
|
||||
|
||||
Reference in New Issue
Block a user