Compare commits

..

7 Commits

Author SHA1 Message Date
Zach Brown 6541ccfdd0 Add test_bit to utils bitmap
Add test_bit() to the trivial utils bitmap.c implementation.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-06 17:18:57 -08:00
Zach Brown b56836b395 Add {read,write}-metadata-image scoutfs commands
Signed-off-by: Zach Brown <zab@versity.com>
2024-03-06 17:18:57 -08:00
Zach Brown de8395a9cf Fix partial rename to check_meta_alloc
As I was committing the initial check command I had only partially
completed a rename of the function that checks the metadata allocators.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-06 17:18:01 -08:00
Zach Brown 94c608f281 (wip) add check command 2024-03-04 15:13:46 -08:00
Zach Brown 84c1460e4f Fix printing alloc list block extents
The list alloc blocks have an array of blknos that are offset by a start
field in the block header.  The print code wasn't using that and was
always referencing the beginning of the array, which could miss blocks.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-04 15:13:46 -08:00
Zach Brown e407f67fcc Import a few more functions to our list.h
Import a few more functions from the kernel's list.h into our imported
copy.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-04 15:13:46 -08:00
Zach Brown 13c4b35bed Add lk rbtree wrapper
Import the kernel's rbtree implementation with a wrapper so we can use
it from userspace.

Signed-off-by: Zach Brown <zab@versity.com>
2024-03-04 15:13:46 -08:00
48 changed files with 2941 additions and 2970 deletions
+3 -9
View File
@@ -12,22 +12,17 @@ else
SP = @:
endif
SCOUTFS_GIT_DESCRIBE ?= \
SCOUTFS_GIT_DESCRIBE := \
$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
echo no-git)
ESCAPED_GIT_DESCRIBE := \
$(shell echo $(SCOUTFS_GIT_DESCRIBE) |sed -e 's/\//\\\//g')
RPM_GITHASH ?= $(shell git rev-parse --short HEAD)
SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
RPM_GITHASH=$(RPM_GITHASH) \
CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
EXTRA_CFLAGS="-Werror"
# - We use the git describe from tags to set up the RPM versioning
RPM_VERSION := $(shell git describe --long --tags | awk -F '-' '{gsub(/^v/,""); print $$1}')
RPM_GITHASH := $(shell git rev-parse --short HEAD)
TARFILE = scoutfs-kmod-$(RPM_VERSION).tar
@@ -46,8 +41,7 @@ modules_install:
%.spec: %.spec.in .FORCE
sed -e 's/@@VERSION@@/$(RPM_VERSION)/g' \
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' \
-e 's/@@GITDESCRIBE@@/$(ESCAPED_GIT_DESCRIBE)/g' < $< > $@+
-e 's/@@GITHASH@@/$(RPM_GITHASH)/g' < $< > $@+
mv $@+ $@
+2 -14
View File
@@ -1,7 +1,6 @@
%define kmod_name scoutfs
%define kmod_version @@VERSION@@
%define kmod_git_hash @@GITHASH@@
%define kmod_git_describe @@GITDESCRIBE@@
%define pkg_date %(date +%%Y%%m%%d)
# Disable the building of the debug package(s).
@@ -76,7 +75,7 @@ echo "Building for kernel: %{kernel_version} flavors: '%{flavors_to_build}'"
for flavor in %flavors_to_build; do
rm -rf obj/$flavor
cp -r source obj/$flavor
make RPM_GITHASH=%{kmod_git_hash} SCOUTFS_GIT_DESCRIBE=%{kmod_git_describe} SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
make SK_KSRC=%{kernel_source $flavor} -C obj/$flavor module
done
%install
@@ -98,21 +97,10 @@ find %{buildroot} -type f -name \*.ko -exec %{__chmod} u+x \{\} \;
/lib/modules
%post
echo /lib/modules/%{kversion}/%{install_mod_dir}/scoutfs.ko | weak-modules --add-modules --no-initramfs
weak-modules --add-kernel --no-initramfs
depmod -a
%endif
%clean
rm -rf %{buildroot}
%preun
# stash our modules for postun cleanup
SCOUTFS_RPM_NAME=$(rpm -q %{name} | grep "%{version}-%{release}")
rpm -ql $SCOUTFS_RPM_NAME | grep '\.ko$' > /var/run/%{name}-modules-%{version}-%{release} || true
%postun
if [ -x /sbin/weak-modules ]; then
cat /var/run/%{name}-modules-%{version}-%{release} | /sbin/weak-modules --remove-modules --no-initramfs
fi
rm /var/run/%{name}-modules-%{version}-%{release} || true
+2 -6
View File
@@ -12,8 +12,7 @@ BIN := src/createmany \
src/find_xattrs \
src/create_xattr_loop \
src/fragmented_data_extents \
src/o_tmpfile_umask \
src/parallel_restore
src/o_tmpfile_umask
DEPS := $(wildcard src/*.d)
@@ -23,11 +22,8 @@ ifneq ($(DEPS),)
-include $(DEPS)
endif
src/parallel_restore_cflags := ../utils/src/scoutfs_parallel_restore.a -lm
$(BIN): %: %.c Makefile
gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@ $($(@)_cflags)
gcc $(CFLAGS) -MD -MP -MF $*.d $< -o $@
.PHONY: clean
clean:
-782
View File
@@ -1,782 +0,0 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <stdlib.h>
#include <stdio.h>
#include <stdbool.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/xattr.h>
#include <ctype.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <time.h>
#include <sys/prctl.h>
#include <signal.h>
#include <sys/socket.h>
#include "../../utils/src/sparse.h"
#include "../../utils/src/util.h"
#include "../../utils/src/list.h"
#include "../../utils/src/parse.h"
#include "../../kmod/src/format.h"
#include "../../utils/src/parallel_restore.h"
/*
* XXX:
* - add a nice description of what's going on
* - mention allocator contention
* - test child process dying handling
* - root dir entry name length is wrong
*/
#define ERRF " errno %d (%s)"
#define ERRA errno, strerror(errno)
#define error_exit(cond, fmt, args...) \
do { \
if (cond) { \
printf("error: "fmt"\n", ##args); \
exit(1); \
} \
} while (0)
#define dprintf(fmt, args...) \
do { \
if (0) \
printf(fmt, ##args); \
} while (0)
#define REG_MODE (S_IFREG | 0644)
#define DIR_MODE (S_IFDIR | 0755)
struct opts {
unsigned long long buf_size;
unsigned long long write_batch;
unsigned long long low_dirs;
unsigned long long high_dirs;
unsigned long long low_files;
unsigned long long high_files;
char *meta_path;
unsigned long long total_files;
bool read_only;
unsigned long long seed;
unsigned long long nr_writers;
};
static void usage(void)
{
printf("usage:\n"
" -b NR | threads write blocks in batches files (100000)\n"
" -d LOW:HIGH | range of subdirs per directory (5:10)\n"
" -f LOW:HIGH | range of files per directory (10:20)\n"
" -m PATH | path to metadata device\n"
" -n NR | total number of files to create (100)\n"
" -r | read-only, all work except writing, measure cpu cost\n"
" -s NR | randomization seed (random)\n"
" -w NR | number of writing processes to fork (online cpus)\n"
);
}
static size_t write_bufs(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t buf_size, int dev_fd)
{
size_t total = 0;
size_t count;
off_t off;
int ret;
do {
ret = scoutfs_parallel_restore_write_buf(wri, buf, buf_size, &off, &count);
error_exit(ret, "write buf %d", ret);
if (count > 0) {
if (!opts->read_only)
ret = pwrite(dev_fd, buf, count, off);
else
ret = count;
error_exit(ret != count, "pwrite count %zu ret %d", count, ret);
total += ret;
}
} while (count > 0);
return total;
}
struct gen_inode {
struct scoutfs_parallel_restore_inode inode;
struct scoutfs_parallel_restore_xattr **xattrs;
u64 nr_xattrs;
struct scoutfs_parallel_restore_entry **entries;
u64 nr_files;
u64 nr_entries;
};
static void free_gino(struct gen_inode *gino)
{
u64 i;
if (gino) {
if (gino->entries) {
for (i = 0; i < gino->nr_entries; i++)
free(gino->entries[i]);
free(gino->entries);
}
if (gino->xattrs) {
for (i = 0; i < gino->nr_xattrs; i++)
free(gino->xattrs[i]);
free(gino->xattrs);
}
free(gino);
}
}
static struct scoutfs_parallel_restore_xattr *
generate_xattr(struct opts *opts, u64 ino, u64 pos, char *name, int name_len, void *value,
int value_len)
{
struct scoutfs_parallel_restore_xattr *xattr;
xattr = malloc(sizeof(struct scoutfs_parallel_restore_xattr) + name_len + value_len);
error_exit(!xattr, "error allocating generated xattr");
*xattr = (struct scoutfs_parallel_restore_xattr) {
.ino = ino,
.pos = pos,
.name_len = name_len,
.value_len = value_len,
};
xattr->name = (void *)(xattr + 1);
xattr->value = (void *)(xattr->name + name_len);
memcpy(xattr->name, name, name_len);
if (value_len)
memcpy(xattr->value, value, value_len);
return xattr;
}
static struct gen_inode *generate_inode(struct opts *opts, u64 ino, mode_t mode)
{
struct gen_inode *gino;
struct timespec now;
clock_gettime(CLOCK_REALTIME, &now);
gino = calloc(1, sizeof(struct gen_inode));
error_exit(!gino, "failure allocating generated inode");
gino->inode = (struct scoutfs_parallel_restore_inode) {
.ino = ino,
.mode = mode,
.atime = now,
.ctime = now,
.mtime = now,
.crtime = now,
};
/*
* hacky creation of a bunch of xattrs for now.
*/
if ((mode & S_IFMT) == S_IFREG) {
#define NV(n, v) { n, sizeof(n) - 1, v, sizeof(v) - 1, }
struct name_val {
char *name;
int len;
char *value;
int value_len;
} nv[] = {
NV("scoutfs.hide.totl.acct.8314611887310466424.2.0", "1"),
NV("scoutfs.hide.srch.sam_vol_E01001L6_4", ""),
NV("scoutfs.hide.sam_reqcopies", ""),
NV("scoutfs.hide.sam_copy_2", ""),
NV("scoutfs.hide.totl.acct.F01030L6.8314611887310466424.7.30", "1"),
NV("scoutfs.hide.sam_copy_1", ""),
NV("scoutfs.hide.srch.sam_vol_F01030L6_4", ""),
NV("scoutfs.hide.srch.sam_release_cand", ""),
NV("scoutfs.hide.sam_restime", ""),
NV("scoutfs.hide.sam_uuid", ""),
NV("scoutfs.hide.totl.acct.8314611887310466424.3.0", "1"),
NV("scoutfs.hide.srch.sam_vol_F01030L6", ""),
NV("scoutfs.hide.srch.sam_uuid_865939b7-24d6-472f-b85c-7ce7afeb813a", ""),
NV("scoutfs.hide.srch.sam_vol_E01001L6", ""),
NV("scoutfs.hide.totl.acct.E01001L6.8314611887310466424.7.1", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.4.0", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.11.0", "1"),
NV("scoutfs.hide.totl.acct.8314611887310466424.1.0", "1"),
};
unsigned int nr = array_size(nv);
int i;
gino->xattrs = calloc(nr, sizeof(struct scoutfs_parallel_restore_xattr *));
for (i = 0; i < nr; i++)
gino->xattrs[i] = generate_xattr(opts, ino, i, nv[i].name, nv[i].len,
nv[i].value, nv[i].value_len);
gino->nr_xattrs = nr;
gino->inode.nr_xattrs = nr;
}
return gino;
}
static struct scoutfs_parallel_restore_entry *
generate_entry(struct opts *opts, char *prefix, u64 nr, u64 dir_ino, u64 pos, u64 ino, mode_t mode)
{
struct scoutfs_parallel_restore_entry *entry;
char buf[PATH_MAX];
int bytes;
bytes = snprintf(buf, sizeof(buf), "%s-%llu", prefix, nr);
entry = malloc(sizeof(struct scoutfs_parallel_restore_entry) + bytes);
error_exit(!entry, "error allocating generated entry");
*entry = (struct scoutfs_parallel_restore_entry) {
.dir_ino = dir_ino,
.pos = pos,
.ino = ino,
.mode = mode,
.name = (void *)(entry + 1),
.name_len = bytes,
};
memcpy(entry->name, buf, bytes);
return entry;
}
static u64 random64(void)
{
return ((u64)lrand48() << 32) | lrand48();
}
static u64 random_range(u64 low, u64 high)
{
return low + (random64() % (high - low + 1));
}
static struct gen_inode *generate_dir(struct opts *opts, u64 dir_ino, u64 ino_start, u64 ino_len,
bool no_dirs)
{
struct scoutfs_parallel_restore_entry *entry;
struct gen_inode *gino;
u64 nr_entries;
u64 nr_files;
u64 nr_dirs;
u64 ino;
char *prefix;
mode_t mode;
u64 i;
nr_dirs = no_dirs ? 0 : random_range(opts->low_dirs, opts->high_dirs);
nr_files = random_range(opts->low_files, opts->high_files);
if (1 + nr_dirs + nr_files > ino_len) {
nr_dirs = no_dirs ? 0 : (ino_len - 1) / 2;
nr_files = (ino_len - 1) - nr_dirs;
}
nr_entries = nr_dirs + nr_files;
gino = generate_inode(opts, dir_ino, DIR_MODE);
error_exit(!gino, "error allocating generated inode");
gino->inode.nr_subdirs = nr_dirs;
gino->nr_files = nr_files;
if (nr_entries) {
gino->entries = calloc(nr_entries, sizeof(struct scoutfs_parallel_restore_entry *));
error_exit(!gino->entries, "error allocating generated inode entries");
gino->nr_entries = nr_entries;
}
mode = DIR_MODE;
prefix = "dir";
for (i = 0; i < nr_entries; i++) {
if (i == nr_dirs) {
mode = REG_MODE;
prefix = "file";
}
ino = ino_start + i;
entry = generate_entry(opts, prefix, ino, gino->inode.ino,
SCOUTFS_DIRENT_FIRST_POS + i, ino, mode);
gino->entries[i] = entry;
gino->inode.total_entry_name_bytes += entry->name_len;
}
return gino;
}
/*
* Restore a generated inode. If it's a directory then we also restore
* all its entries. The caller is going to descend into subdir entries and generate
* those dir inodes. We have to generate and restore all non-dir inodes referenced
* by this inode's entries.
*/
static void restore_inode(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
struct gen_inode *gino)
{
struct gen_inode *nondir;
int ret;
u64 i;
ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
error_exit(ret, "thread add root inode %d", ret);
for (i = 0; i < gino->nr_entries; i++) {
ret = scoutfs_parallel_restore_add_entry(wri, gino->entries[i]);
error_exit(ret, "thread add entry %d", ret);
/* caller only needs subdir entries, generate and free others */
if ((gino->entries[i]->mode & S_IFMT) != S_IFDIR) {
nondir = generate_inode(opts, gino->entries[i]->ino,
gino->entries[i]->mode);
restore_inode(opts, wri, nondir);
free_gino(nondir);
free(gino->entries[i]);
if (i != gino->nr_entries - 1)
gino->entries[i] = gino->entries[gino->nr_entries - 1];
gino->nr_entries--;
gino->nr_files--;
i--;
}
}
for (i = 0; i < gino->nr_xattrs; i++) {
ret = scoutfs_parallel_restore_add_xattr(wri, gino->xattrs[i]);
error_exit(ret, "thread add xattr %d", ret);
}
}
struct writer_args {
struct list_head head;
int dev_fd;
int pair_fd;
struct scoutfs_parallel_restore_slice slice;
u64 writer_nr;
u64 dir_height;
u64 ino_start;
u64 ino_len;
};
struct write_result {
struct scoutfs_parallel_restore_progress prog;
struct scoutfs_parallel_restore_slice slice;
__le64 files_created;
__le64 bytes_written;
};
static void write_bufs_and_send(struct opts *opts, struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t buf_size, int dev_fd,
struct write_result *res, bool get_slice, int pair_fd)
{
size_t total;
int ret;
total = write_bufs(opts, wri, buf, buf_size, dev_fd);
le64_add_cpu(&res->bytes_written, total);
ret = scoutfs_parallel_restore_get_progress(wri, &res->prog);
error_exit(ret, "get prog %d", ret);
if (get_slice) {
ret = scoutfs_parallel_restore_get_slice(wri, &res->slice);
error_exit(ret, "thread get slice %d", ret);
}
ret = write(pair_fd, res, sizeof(struct write_result));
error_exit(ret != sizeof(struct write_result), "result send error");
memset(res, 0, sizeof(struct write_result));
}
/*
* Calculate the number of bytes in toplevel "dir-%llu" entry names for the given
* number of writers.
*/
static u64 topdir_entry_bytes(u64 nr_writers)
{
u64 bytes = (3 + 1) * nr_writers;
u64 limit;
u64 done;
u64 wid;
u64 nr;
for (done = 0, wid = 1, limit = 10; done < nr_writers; done += nr, wid++, limit *= 10) {
nr = min(limit - done, nr_writers - done);
bytes += nr * wid;
}
return bytes;
}
struct dir_pos {
struct gen_inode *gino;
u64 pos;
};
static void writer_proc(struct opts *opts, struct writer_args *args)
{
struct scoutfs_parallel_restore_writer *wri = NULL;
struct scoutfs_parallel_restore_entry *entry;
struct dir_pos *dirs = NULL;
struct write_result res;
struct gen_inode *gino;
void *buf = NULL;
u64 level;
u64 ino;
int ret;
memset(&res, 0, sizeof(res));
dirs = calloc(args->dir_height, sizeof(struct dir_pos));
error_exit(errno, "error allocating parent dirs "ERRF, ERRA);
errno = posix_memalign((void **)&buf, 4096, opts->buf_size);
error_exit(errno, "error allocating block buf "ERRF, ERRA);
ret = scoutfs_parallel_restore_create_writer(&wri);
error_exit(ret, "create writer %d", ret);
ret = scoutfs_parallel_restore_add_slice(wri, &args->slice);
error_exit(ret, "add slice %d", ret);
/* writer 0 creates the root dir */
if (args->writer_nr == 0) {
gino = generate_inode(opts, SCOUTFS_ROOT_INO, DIR_MODE);
gino->inode.nr_subdirs = opts->nr_writers;
gino->inode.total_entry_name_bytes = topdir_entry_bytes(opts->nr_writers);
ret = scoutfs_parallel_restore_add_inode(wri, &gino->inode);
error_exit(ret, "thread add root inode %d", ret);
free_gino(gino);
}
/* create root entry for our top level dir */
ino = args->ino_start++;
args->ino_len--;
entry = generate_entry(opts, "top", args->writer_nr,
SCOUTFS_ROOT_INO, SCOUTFS_DIRENT_FIRST_POS + args->writer_nr,
ino, DIR_MODE);
ret = scoutfs_parallel_restore_add_entry(wri, entry);
error_exit(ret, "thread top entry %d", ret);
free(entry);
level = args->dir_height - 1;
while (args->ino_len > 0 && level < args->dir_height) {
gino = dirs[level].gino;
/* generate and restore if we follow entries */
if (!gino) {
gino = generate_dir(opts, ino, args->ino_start, args->ino_len, level == 0);
args->ino_start += gino->nr_entries;
args->ino_len -= gino->nr_entries;
le64_add_cpu(&res.files_created, gino->nr_files);
restore_inode(opts, wri, gino);
dirs[level].gino = gino;
}
if (dirs[level].pos == gino->nr_entries) {
/* ascend if we're done with this dir */
dirs[level].gino = NULL;
dirs[level].pos = 0;
free_gino(gino);
level++;
} else {
/* otherwise descend into subdir entry */
ino = gino->entries[dirs[level].pos]->ino;
dirs[level].pos++;
level--;
}
/* do a partial write at batch intervals when there's still more to do */
if (le64_to_cpu(res.files_created) >= opts->write_batch && args->ino_len > 0)
write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
&res, false, args->pair_fd);
}
write_bufs_and_send(opts, wri, buf, opts->buf_size, args->dev_fd,
&res, true, args->pair_fd);
scoutfs_parallel_restore_destroy_writer(&wri);
free(dirs);
free(buf);
}
static void fork_writer(struct opts *opts, struct writer_args *args)
{
pid_t parent = getpid();
pid_t pid;
int ret;
pid = fork();
error_exit(pid == -1, "fork error");
if (pid != 0)
return;
ret = prctl(PR_SET_PDEATHSIG, SIGHUP);
error_exit(ret < 0, "failed to set parent death sig");
printf("pid %u getpid() %u parent %u getppid() %u\n",
pid, getpid(), parent, getppid());
error_exit(getppid() != parent, "child parent already changed");
writer_proc(opts, args);
exit(0);
}
static int do_restore(struct opts *opts)
{
struct scoutfs_parallel_restore_writer *wri = NULL;
struct scoutfs_parallel_restore_slice *slices = NULL;
struct scoutfs_super_block *super = NULL;
struct write_result res;
struct writer_args *args;
struct timespec begin;
struct timespec end;
LIST_HEAD(writers);
u64 next_ino;
u64 ino_per;
u64 avg_dirs;
u64 avg_files;
u64 dir_height;
u64 tot_files;
u64 tot_bytes;
int pair[2] = {-1, -1};
float secs;
void *buf = NULL;
int dev_fd = -1;
int ret;
int i;
ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, pair);
error_exit(ret, "socketpair error "ERRF, ERRA);
dev_fd = open(opts->meta_path, O_DIRECT | (opts->read_only ? O_RDONLY : (O_RDWR|O_EXCL)));
error_exit(dev_fd < 0, "error opening '%s': "ERRF, opts->meta_path, ERRA);
errno = posix_memalign((void **)&super, 4096, SCOUTFS_BLOCK_SM_SIZE) ?:
posix_memalign((void **)&buf, 4096, opts->buf_size);
error_exit(errno, "error allocating block bufs "ERRF, ERRA);
ret = pread(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error reading super, ret %d", ret);
ret = scoutfs_parallel_restore_create_writer(&wri);
error_exit(ret, "create writer %d", ret);
ret = scoutfs_parallel_restore_import_super(wri, super);
error_exit(ret, "import super %d", ret);
slices = calloc(1 + opts->nr_writers, sizeof(struct scoutfs_parallel_restore_slice));
error_exit(!slices, "alloc slices");
scoutfs_parallel_restore_init_slices(wri, slices, 1 + opts->nr_writers);
ret = scoutfs_parallel_restore_add_slice(wri, &slices[0]);
error_exit(ret, "add slices[0] %d", ret);
next_ino = (SCOUTFS_ROOT_INO | SCOUTFS_LOCK_INODE_GROUP_MASK) + 1;
ino_per = opts->total_files / opts->nr_writers;
avg_dirs = (opts->low_dirs + opts->high_dirs) / 2;
avg_files = (opts->low_files + opts->high_files) / 2;
dir_height = 1;
tot_files = avg_files * opts->nr_writers;
while (tot_files < opts->total_files) {
dir_height++;
tot_files *= avg_dirs;
}
dprintf("height %llu tot %llu total %llu\n", dir_height, tot_files, opts->total_files);
clock_gettime(CLOCK_MONOTONIC_RAW, &begin);
/* start each writing process */
for (i = 0; i < opts->nr_writers; i++) {
args = calloc(1, sizeof(struct writer_args));
error_exit(!args, "alloc writer args");
args->dev_fd = dev_fd;
args->pair_fd = pair[1];
args->slice = slices[1 + i];
args->writer_nr = i;
args->dir_height = dir_height;
args->ino_start = next_ino;
args->ino_len = ino_per;
list_add_tail(&args->head, &writers);
next_ino += ino_per;
fork_writer(opts, args);
}
/* read results and watch for writers to finish */
tot_files = 0;
tot_bytes = 0;
i = 0;
while (i < opts->nr_writers) {
ret = read(pair[0], &res, sizeof(struct write_result));
error_exit(ret != sizeof(struct write_result), "result read error");
ret = scoutfs_parallel_restore_add_progress(wri, &res.prog);
error_exit(ret, "add thr prog %d", ret);
if (res.slice.meta_len != 0) {
ret = scoutfs_parallel_restore_add_slice(wri, &res.slice);
error_exit(ret, "add thr slice %d", ret);
i++;
}
tot_files += le64_to_cpu(res.files_created);
tot_bytes += le64_to_cpu(res.bytes_written);
}
tot_bytes += write_bufs(opts, wri, buf, opts->buf_size, dev_fd);
ret = scoutfs_parallel_restore_export_super(wri, super);
error_exit(ret, "update super %d", ret);
if (!opts->read_only) {
ret = pwrite(dev_fd, super, SCOUTFS_BLOCK_SM_SIZE,
SCOUTFS_SUPER_BLKNO << SCOUTFS_BLOCK_SM_SHIFT);
error_exit(ret != SCOUTFS_BLOCK_SM_SIZE, "error writing super, ret %d", ret);
}
clock_gettime(CLOCK_MONOTONIC_RAW, &end);
scoutfs_parallel_restore_destroy_writer(&wri);
secs = ((float)end.tv_sec + ((float)end.tv_nsec/NSEC_PER_SEC)) -
((float)begin.tv_sec + ((float)begin.tv_nsec/NSEC_PER_SEC));
printf("created %llu files in %llu bytes and %f secs => %f bytes/file, %f files/sec\n",
tot_files, tot_bytes, secs,
(float)tot_bytes / tot_files, (float)tot_files / secs);
if (dev_fd >= 0)
close(dev_fd);
if (pair[0] >= 0)
close(pair[0]);
if (pair[1] >= 0)
close(pair[1]);
free(super);
free(slices);
free(buf);
return 0;
}
static int parse_low_high(char *str, u64 *low_ret, u64 *high_ret)
{
char *sep;
int ret = 0;
sep = index(str, ':');
if (sep) {
*sep = '\0';
ret = parse_u64(sep + 1, high_ret);
}
if (ret == 0)
ret = parse_u64(str, low_ret);
if (sep)
*sep = ':';
return ret;
}
int main(int argc, char **argv)
{
struct opts opts = {
.buf_size = (32 * 1024 * 1024),
.write_batch = 1000000,
.low_dirs = 5,
.high_dirs = 10,
.low_files = 10,
.high_files = 20,
.total_files = 100,
};
int ret;
int c;
opts.seed = random64();
opts.nr_writers = sysconf(_SC_NPROCESSORS_ONLN);
while ((c = getopt(argc, argv, "b:d:f:m:n:rs:w:")) != -1) {
switch(c) {
case 'b':
ret = parse_u64(optarg, &opts.write_batch);
error_exit(ret, "error parsing -b '%s'\n", optarg);
error_exit(opts.write_batch == 0, "-b can't be 0");
break;
case 'd':
ret = parse_low_high(optarg, &opts.low_dirs, &opts.high_dirs);
error_exit(ret, "error parsing -d '%s'\n", optarg);
break;
case 'f':
ret = parse_low_high(optarg, &opts.low_files, &opts.high_files);
error_exit(ret, "error parsing -f '%s'\n", optarg);
break;
case 'm':
opts.meta_path = strdup(optarg);
break;
case 'n':
ret = parse_u64(optarg, &opts.total_files);
error_exit(ret, "error parsing -n '%s'\n", optarg);
break;
case 'r':
opts.read_only = true;
break;
case 's':
ret = parse_u64(optarg, &opts.seed);
error_exit(ret, "error parsing -s '%s'\n", optarg);
break;
case 'w':
ret = parse_u64(optarg, &opts.nr_writers);
error_exit(ret, "error parsing -w '%s'\n", optarg);
break;
case '?':
printf("Unknown option '%c'\n", optopt);
usage();
exit(1);
}
}
error_exit(opts.low_dirs > opts.high_dirs, "LOW > HIGH in -d %llu:%llu",
opts.low_dirs, opts.high_dirs);
error_exit(opts.low_files > opts.high_files, "LOW > HIGH in -f %llu:%llu",
opts.low_files, opts.high_files);
error_exit(!opts.meta_path, "must specify metadata device path with -m");
printf("recreate with: -d %llu:%llu -f %llu:%llu -n %llu -s %llu -w %llu\n",
opts.low_dirs, opts.high_dirs, opts.low_files, opts.high_files,
opts.total_files, opts.seed, opts.nr_writers);
ret = do_restore(&opts);
free(opts.meta_path);
return ret == 0 ? 0 : 1;
}
+5 -10
View File
@@ -7,7 +7,7 @@ FMTIOC_H := format.h ioctl.h
FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H))
CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
-fno-strict-aliasing \
-I src/ -fno-strict-aliasing \
-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),)
@@ -15,12 +15,11 @@ CFLAGS += -I../kmod/src
endif
BIN := src/scoutfs
OBJ := $(patsubst %.c,%.o,$(wildcard src/*.c))
DEPS := $(wildcard */*.d)
OBJ_DIRS := src src/check
OBJ := $(foreach dir,$(OBJ_DIRS),$(patsubst %.c,%.o,$(wildcard $(dir)/*.c)))
DEPS := $(foreach dir,$(OBJ_DIRS),$(wildcard $(dir)/*.d))
AR := src/scoutfs_parallel_restore.a
all: $(BIN) $(AR)
all: $(BIN)
ifneq ($(DEPS),)
-include $(DEPS)
@@ -38,10 +37,6 @@ $(BIN): $(OBJ)
$(QU) [BIN $@]
$(VE)gcc -o $@ $^ -luuid -lm -lcrypto -lblkid
$(AR): $(OBJ)
$(QU) [AR $@]
$(VE)ar rcs $@ $^
%.o %.d: %.c Makefile sparse.sh
$(QU) [CC $<]
$(VE)gcc $(CFLAGS) -MD -MP -MF $*.d -c $< -o $*.o
-3
View File
@@ -54,8 +54,6 @@ cp man/*.8.gz $RPM_BUILD_ROOT%{_mandir}/man8/.
install -m 755 -D src/scoutfs $RPM_BUILD_ROOT%{_sbindir}/scoutfs
install -m 644 -D src/ioctl.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/ioctl.h
install -m 644 -D src/format.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/format.h
install -m 644 -D src/parallel_restore.h $RPM_BUILD_ROOT%{_includedir}/scoutfs/parallel_restore.h
install -m 644 -D src/scoutfs_parallel_restore.a $RPM_BUILD_ROOT%{_libdir}/scoutfs/libscoutfs_parallel_restore.a
install -m 755 -D fenced/scoutfs-fenced $RPM_BUILD_ROOT%{_libexecdir}/scoutfs-fenced/scoutfs-fenced
install -m 644 -D fenced/scoutfs-fenced.service $RPM_BUILD_ROOT%{_unitdir}/scoutfs-fenced.service
install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/scoutfs/scoutfs-fenced.conf.example
@@ -72,7 +70,6 @@ install -m 644 -D fenced/scoutfs-fenced.conf.example $RPM_BUILD_ROOT%{_sysconfdi
%files -n scoutfs-devel
%defattr(644,root,root,755)
%{_includedir}/scoutfs
%{_libdir}/scoutfs
%clean
rm -rf %{buildroot}
+5
View File
@@ -10,6 +10,11 @@
* Just a quick simple native bitmap.
*/
int test_bit(unsigned long *bits, u64 nr)
{
return !!(bits[nr / BITS_PER_LONG] & (1UL << (nr & (BITS_PER_LONG - 1))));
}
void set_bit(unsigned long *bits, u64 nr)
{
bits[nr / BITS_PER_LONG] |= 1UL << (nr & (BITS_PER_LONG - 1));
+1
View File
@@ -1,6 +1,7 @@
#ifndef _BITMAP_H_
#define _BITMAP_H_
int test_bit(unsigned long *bits, u64 nr);
void set_bit(unsigned long *bits, u64 nr);
void clear_bit(unsigned long *bits, u64 nr);
u64 find_next_set_bit(unsigned long *start, u64 from, u64 total);
-20
View File
@@ -1,20 +0,0 @@
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "hash.h"
#include "bloom.h"
void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs)
{
u64 hash;
int i;
hash = scoutfs_hash64(key, sizeof(struct scoutfs_key));
for (i = 0; i < SCOUTFS_FOREST_BLOOM_NRS; i++) {
nrs[i] = (u32)hash % SCOUTFS_FOREST_BLOOM_BITS;
hash >>= SCOUTFS_FOREST_BLOOM_FUNC_BITS;
}
}
-6
View File
@@ -1,6 +0,0 @@
#ifndef _BLOOM_H_
#define _BLOOM_H_
void calc_bloom_nrs(struct scoutfs_key *key, unsigned int *nrs);
#endif
+2 -2
View File
@@ -8,7 +8,7 @@
#include "leaf_item_hash.h"
#include "btree.h"
void btree_init_block(struct scoutfs_btree_block *bt, int level)
static void init_block(struct scoutfs_btree_block *bt, int level)
{
int free;
@@ -33,7 +33,7 @@ void btree_init_root_single(struct scoutfs_btree_root *root,
memset(bt, 0, SCOUTFS_BLOCK_LG_SIZE);
btree_init_block(bt, 0);
init_block(bt, 0);
}
static void *alloc_val(struct scoutfs_btree_block *bt, int len)
-1
View File
@@ -1,7 +1,6 @@
#ifndef _BTREE_H_
#define _BTREE_H_
void btree_init_block(struct scoutfs_btree_block *bt, int level);
void btree_init_root_single(struct scoutfs_btree_root *root,
struct scoutfs_btree_block *bt,
u64 seq, u64 blkno);
+159
View File
@@ -0,0 +1,159 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "bitmap.h"
#include "key.h"
#include "alloc.h"
#include "block.h"
#include "btree.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
/*
* We check the list blocks serially.
*
* XXX:
* - compare ref seqs
* - detect cycles?
*/
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
{
struct scoutfs_alloc_list_block *lblk;
struct scoutfs_block_ref ref;
struct block *blk = NULL;
u64 blkno;
int ret;
ref = lhead->ref;
while (ref.blkno) {
blkno = le64_to_cpu(ref.blkno);
ret = cb(blkno, 1, cb_arg);
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
ret = block_get(&blk, blkno, 0);
if (ret < 0)
goto out;
lblk = block_buf(blk);
/* XXX verify block */
/* XXX sort? maybe */
ref = lblk->next;
block_put(&blk);
}
ret = 0;
out:
return ret;
}
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
{
return btree_meta_iter(&root->root, cb, cb_arg);
}
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg)
{
struct scoutfs_alloc_list_block *lblk;
struct scoutfs_block_ref ref;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
ref = lhead->ref;
while (ref.blkno) {
blkno = le64_to_cpu(ref.blkno);
ret = block_get(&blk, blkno, 0);
if (ret < 0)
goto out;
sns_push("alloc_list_block", blkno, 0);
lblk = block_buf(blk);
/* XXX verify block */
/* XXX sort? maybe */
ret = 0;
for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
blkno = le64_to_cpu(lblk->blknos[le32_to_cpu(lblk->start) + i]);
ret = cb(blkno, 1, cb_arg);
if (ret < 0)
break;
}
ref = lblk->next;
block_put(&blk);
sns_pop();
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
}
ret = 0;
out:
return ret;
}
static bool valid_free_extent_key(struct scoutfs_key *key)
{
return (key->sk_zone == SCOUTFS_FREE_EXTENT_BLKNO_ZONE ||
key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE) &&
(!key->_sk_fourth && !key->sk_type &&
(key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE || !key->_sk_third));
}
static int free_item_cb(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
{
struct extent_cb_arg_t *ecba = cb_arg;
u64 start;
u64 len;
/* XXX not sure these eios are what we want */
if (val_len != 0)
return -EIO;
if (!valid_free_extent_key(key))
return -EIO;
if (key->sk_zone == SCOUTFS_FREE_EXTENT_ORDER_ZONE)
return -ECHECK_ITER_DONE;
start = le64_to_cpu(key->skfb_end) - le64_to_cpu(key->skfb_len) + 1;
len = le64_to_cpu(key->skfb_len);
return ecba->cb(start, len, ecba->cb_arg);
}
/*
* Call the callback with each of the primary BLKNO free extents stored
* in item in the given alloc root. It doesn't visit the secondary
* ORDER extents.
*/
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg)
{
struct extent_cb_arg_t ecba = { .cb = cb, .cb_arg = cb_arg };
return btree_item_iter(&root->root, free_item_cb, &ecba);
}
+12
View File
@@ -0,0 +1,12 @@
#ifndef _SCOUTFS_UTILS_CHECK_ALLOC_H
#define _SCOUTFS_UTILS_CHECK_ALLOC_H
#include "extent.h"
int alloc_list_meta_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
int alloc_root_meta_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
int alloc_list_extent_iter(struct scoutfs_alloc_list_head *lhead, extent_cb_t cb, void *cb_arg);
int alloc_root_extent_iter(struct scoutfs_alloc_root *root, extent_cb_t cb, void *cb_arg);
#endif
+564
View File
@@ -0,0 +1,564 @@
#define _ISOC11_SOURCE /* aligned_alloc */
#define _DEFAULT_SOURCE /* syscall() */
#include <stdlib.h>
#include <unistd.h>
#include <stdbool.h>
#include <stdio.h>
#include <errno.h>
#include <sys/syscall.h>
#include <linux/aio_abi.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "list.h"
#include "cmp.h"
#include "hash.h"
#include "block.h"
#include "debug.h"
#include "eno.h"
static struct block_data {
struct list_head *hash_lists;
size_t hash_nr;
struct list_head active_head;
struct list_head inactive_head;
struct list_head dirty_list;
size_t nr_active;
size_t nr_inactive;
size_t nr_dirty;
int meta_fd;
size_t max_cached;
size_t nr_events;
aio_context_t ctx;
struct iocb *iocbs;
struct iocb **iocbps;
struct io_event *events;
} global_bdat;
struct block {
struct list_head hash_head;
struct list_head lru_head;
struct list_head dirty_head;
struct list_head submit_head;
unsigned long refcount;
unsigned long uptodate:1,
active:1;
u64 blkno;
void *buf;
size_t size;
};
#define BLK_FMT \
"blkno %llu rc %ld d %u a %u"
#define BLK_ARG(blk) \
(blk)->blkno, (blk)->refcount, !list_empty(&(blk)->dirty_head), blk->active
#define debug_blk(blk, fmt, args...) \
debug(fmt " " BLK_FMT, ##args, BLK_ARG(blk))
/*
* This just allocates and initialzies the block. The caller is
* responsible for putting it on the appropriate initial lists and
* managing refcounts.
*/
static struct block *alloc_block(struct block_data *bdat, u64 blkno, size_t size)
{
struct block *blk;
blk = calloc(1, sizeof(struct block));
if (blk) {
blk->buf = aligned_alloc(4096, size); /* XXX static alignment :/ */
if (!blk->buf) {
free(blk);
blk = NULL;
} else {
INIT_LIST_HEAD(&blk->hash_head);
INIT_LIST_HEAD(&blk->lru_head);
INIT_LIST_HEAD(&blk->dirty_head);
INIT_LIST_HEAD(&blk->submit_head);
blk->blkno = blkno;
blk->size = size;
}
}
return blk;
}
static void free_block(struct block_data *bdat, struct block *blk)
{
debug_blk(blk, "free");
if (!list_empty(&blk->lru_head)) {
if (blk->active)
bdat->nr_active--;
else
bdat->nr_inactive--;
list_del(&blk->lru_head);
}
if (!list_empty(&blk->dirty_head)) {
bdat->nr_dirty--;
list_del(&blk->dirty_head);
}
if (!list_empty(&blk->hash_head))
list_del(&blk->hash_head);
if (!list_empty(&blk->submit_head))
list_del(&blk->submit_head);
free(blk->buf);
free(blk);
}
static bool blk_is_dirty(struct block *blk)
{
return !list_empty(&blk->dirty_head);
}
/*
* Rebalance the cache.
*
* First we shrink the cache to limit it to max_cached blocks.
* Logically, we walk from oldest to newest in the inactive list and
* then in the active list. Since these lists are physically one
* list_head list we achieve this with a reverse walk starting from the
* active head.
*
* Then we rebalnace the size of the two lists. The constraint is that
* we don't let the active list grow larger than the inactive list. We
* move blocks from the oldest tail of the active list to the newest
* head of the inactive list.
*
* <- [active head] <-> [ .. active list .. ] <-> [inactive head] <-> [ .. inactive list .. ] ->
*/
static void rebalance_cache(struct block_data *bdat)
{
struct block *blk;
struct block *blk_;
list_for_each_entry_safe_reverse(blk, blk_, &bdat->active_head, lru_head) {
if ((bdat->nr_active + bdat->nr_inactive) < bdat->max_cached)
break;
if (&blk->lru_head == &bdat->inactive_head || blk->refcount > 0 ||
blk_is_dirty(blk))
continue;
free_block(bdat, blk);
}
list_for_each_entry_safe_reverse(blk, blk_, &bdat->inactive_head, lru_head) {
if (bdat->nr_active <= bdat->nr_inactive || &blk->lru_head == &bdat->active_head)
break;
list_move(&blk->lru_head, &bdat->inactive_head);
blk->active = 0;
bdat->nr_active--;
bdat->nr_inactive++;
}
}
static void make_active(struct block_data *bdat, struct block *blk)
{
if (!blk->active) {
if (!list_empty(&blk->lru_head)) {
list_move(&blk->lru_head, &bdat->active_head);
bdat->nr_inactive--;
} else {
list_add(&blk->lru_head, &bdat->active_head);
}
blk->active = 1;
bdat->nr_active++;
}
}
static int compar_iocbp(const void *A, const void *B)
{
struct iocb *a = *(struct iocb **)A;
struct iocb *b = *(struct iocb **)B;
return scoutfs_cmp(a->aio_offset, b->aio_offset);
}
static int submit_and_wait(struct block_data *bdat, struct list_head *list)
{
struct io_event *event;
struct iocb *iocb;
struct block *blk;
int ret;
int err;
int nr;
int i;
err = 0;
nr = 0;
list_for_each_entry(blk, list, submit_head) {
iocb = &bdat->iocbs[nr];
bdat->iocbps[nr] = iocb;
memset(iocb, 0, sizeof(struct iocb));
iocb->aio_data = (intptr_t)blk;
iocb->aio_lio_opcode = blk_is_dirty(blk) ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD;
iocb->aio_fildes = bdat->meta_fd;
iocb->aio_buf = (intptr_t)blk->buf;
iocb->aio_nbytes = blk->size;
iocb->aio_offset = blk->blkno * blk->size;
nr++;
debug_blk(blk, "submit");
if ((nr < bdat->nr_events) && blk->submit_head.next != list)
continue;
qsort(bdat->iocbps, nr, sizeof(bdat->iocbps[0]), compar_iocbp);
ret = syscall(__NR_io_submit, bdat->ctx, nr, bdat->iocbps);
if (ret != nr) {
if (ret >= 0)
errno = EIO;
ret = -errno;
printf("fatal system error submitting async IO: "ENO_FMT"\n",
ENO_ARG(-ret));
goto out;
}
ret = syscall(__NR_io_getevents, bdat->ctx, nr, nr, bdat->events, NULL);
if (ret != nr) {
if (ret >= 0)
errno = EIO;
ret = -errno;
printf("fatal system error getting IO events: "ENO_FMT"\n",
ENO_ARG(-ret));
goto out;
}
ret = 0;
for (i = 0; i < nr; i++) {
event = &bdat->events[i];
iocb = (struct iocb *)(intptr_t)event->obj;
blk = (struct block *)(intptr_t)event->data;
debug_blk(blk, "complete res %lld", (long long)event->res);
if (event->res >= 0 && event->res != blk->size)
event->res = -EIO;
/* io errors are fatal */
if (event->res < 0) {
ret = event->res;
goto out;
}
if (iocb->aio_lio_opcode == IOCB_CMD_PREAD) {
blk->uptodate = 1;
} else {
list_del_init(&blk->dirty_head);
bdat->nr_dirty--;
}
}
nr = 0;
}
ret = 0;
out:
return ret ?: err;
}
static void inc_refcount(struct block *blk)
{
blk->refcount++;
}
void block_put(struct block **blkp)
{
struct block_data *bdat = &global_bdat;
struct block *blk = *blkp;
if (blk) {
blk->refcount--;
*blkp = NULL;
rebalance_cache(bdat);
}
}
static struct list_head *hash_bucket(struct block_data *bdat, u64 blkno)
{
u32 hash = scoutfs_hash32(&blkno, sizeof(blkno));
return &bdat->hash_lists[hash % bdat->hash_nr];
}
static struct block *get_or_alloc(struct block_data *bdat, u64 blkno, int bf)
{
struct list_head *bucket = hash_bucket(bdat, blkno);
struct block *search;
struct block *blk;
size_t size;
size = (bf & BF_SM) ? SCOUTFS_BLOCK_SM_SIZE : SCOUTFS_BLOCK_LG_SIZE;
blk = NULL;
list_for_each_entry(search, bucket, hash_head) {
if (search->blkno && blkno && search->size == size) {
blk = search;
break;
}
}
if (!blk) {
blk = alloc_block(bdat, blkno, size);
if (blk) {
list_add(&blk->hash_head, bucket);
list_add(&blk->lru_head, &bdat->inactive_head);
bdat->nr_inactive++;
}
}
if (blk)
inc_refcount(blk);
return blk;
}
/*
* Get a block.
*
* The caller holds a refcount to the block while it's in use that
* prevents it from being removed from the cache. It must be dropped
* with block_put();
*/
int block_get(struct block **blk_ret, u64 blkno, int bf)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
LIST_HEAD(list);
int ret;
blk = get_or_alloc(bdat, blkno, bf);
if (!blk) {
ret = -ENOMEM;
goto out;
}
if ((bf & BF_ZERO)) {
memset(blk->buf, 0, blk->size);
blk->uptodate = 1;
}
if (bf & BF_OVERWRITE)
blk->uptodate = 1;
if (!blk->uptodate) {
list_add(&blk->submit_head, &list);
ret = submit_and_wait(bdat, &list);
list_del_init(&blk->submit_head);
if (ret < 0)
goto out;
}
if ((bf & BF_DIRTY) && !blk_is_dirty(blk)) {
list_add_tail(&bdat->dirty_list, &blk->dirty_head);
bdat->nr_dirty++;
}
make_active(bdat, blk);
rebalance_cache(bdat);
ret = 0;
out:
if (ret < 0)
block_put(&blk);
*blk_ret = blk;
return ret;
}
void *block_buf(struct block *blk)
{
return blk->buf;
}
size_t block_size(struct block *blk)
{
return blk->size;
}
/*
* Drop the block from the cache, regardless of if it was free or not.
* This is used to avoid writing blocks which were dirtied but then
* later freed.
*
* The block is immediately freed and can't be referenced after this
* returns.
*/
void block_drop(struct block **blkp)
{
struct block_data *bdat = &global_bdat;
free_block(bdat, *blkp);
*blkp = NULL;
rebalance_cache(bdat);
}
/*
* This doesn't quite work for mixing large and small blocks, but that's
* fine, we never do that.
*/
static int compar_u64(const void *A, const void *B)
{
u64 a = *((u64 *)A);
u64 b = *((u64 *)B);
return scoutfs_cmp(a, b);
}
/*
* This read-ahead is synchronous and errors are ignored. If any of the
* blknos aren't present in the cache then we issue concurrent reads for
* them and wait. Any existing cached blocks will be left as is.
*
* We might be trying to read a lot more than the number of events so we
* sort the caller's blknos before iterating over them rather than
* relying on submission sorting the blocks in each submitted set.
*/
void block_readahead(u64 *blknos, size_t nr)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
struct block *blk_;
LIST_HEAD(list);
size_t i;
if (nr == 0)
return;
qsort(blknos, nr, sizeof(blknos[0]), compar_u64);
for (i = 0; i < nr; i++) {
blk = get_or_alloc(bdat, blknos[i], 0);
if (blk) {
if (!blk->uptodate)
list_add_tail(&blk->submit_head, &list);
else
block_put(&blk);
}
}
(void)submit_and_wait(bdat, &list);
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
list_del_init(&blk->submit_head);
block_put(&blk);
}
rebalance_cache(bdat);
}
/*
* The caller's block changes form a consistent transaction. If the amount of dirty
* blocks is large enough we issue a write.
*/
int block_try_commit(bool force)
{
struct block_data *bdat = &global_bdat;
struct block *blk;
struct block *blk_;
LIST_HEAD(list);
int ret;
if (!force && bdat->nr_dirty < bdat->nr_events)
return 0;
list_for_each_entry(blk, &bdat->dirty_list, dirty_head) {
list_add_tail(&blk->submit_head, &list);
inc_refcount(blk);
}
ret = submit_and_wait(bdat, &list);
list_for_each_entry_safe(blk, blk_, &list, submit_head) {
list_del_init(&blk->submit_head);
block_put(&blk);
}
if (ret < 0) {
printf("error writing dirty transaction blocks\n");
goto out;
}
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM | BF_OVERWRITE | BF_DIRTY);
if (ret == 0) {
list_add(&blk->submit_head, &list);
ret = submit_and_wait(bdat, &list);
list_del_init(&blk->submit_head);
block_put(&blk);
} else {
ret = -ENOMEM;
}
if (ret < 0)
printf("error writing super block to commit transaction\n");
out:
rebalance_cache(bdat);
return ret;
}
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes)
{
struct block_data *bdat = &global_bdat;
size_t i;
int ret;
bdat->max_cached = DIV_ROUND_UP(max_cached_bytes, SCOUTFS_BLOCK_LG_SIZE);
bdat->hash_nr = bdat->max_cached / 4;
bdat->nr_events = DIV_ROUND_UP(max_dirty_bytes, SCOUTFS_BLOCK_LG_SIZE);
bdat->iocbs = calloc(bdat->nr_events, sizeof(bdat->iocbs[0]));
bdat->iocbps = calloc(bdat->nr_events, sizeof(bdat->iocbps[0]));
bdat->events = calloc(bdat->nr_events, sizeof(bdat->events[0]));
bdat->hash_lists = calloc(bdat->hash_nr, sizeof(bdat->hash_lists[0]));
if (!bdat->iocbs || !bdat->iocbps || !bdat->events || !bdat->hash_lists) {
ret = -ENOMEM;
goto out;
}
INIT_LIST_HEAD(&bdat->active_head);
INIT_LIST_HEAD(&bdat->inactive_head);
INIT_LIST_HEAD(&bdat->dirty_list);
bdat->meta_fd = meta_fd;
list_add(&bdat->inactive_head, &bdat->active_head);
for (i = 0; i < bdat->hash_nr; i++)
INIT_LIST_HEAD(&bdat->hash_lists[i]);
ret = syscall(__NR_io_setup, bdat->nr_events, &bdat->ctx);
out:
if (ret < 0) {
free(bdat->iocbs);
free(bdat->iocbps);
free(bdat->events);
free(bdat->hash_lists);
}
return ret;
}
void block_shutdown(void)
{
struct block_data *bdat = &global_bdat;
syscall(SYS_io_destroy, bdat->ctx);
free(bdat->iocbs);
free(bdat->iocbps);
free(bdat->events);
free(bdat->hash_lists);
}
+32
View File
@@ -0,0 +1,32 @@
#ifndef _SCOUTFS_UTILS_CHECK_BLOCK_H_
#define _SCOUTFS_UTILS_CHECK_BLOCK_H_
#include <unistd.h>
#include <stdbool.h>
struct block;
#include "sparse.h"
/* block flags passed to block_get() */
enum {
BF_ZERO = (1 << 0), /* zero contents buf as block is returned */
BF_DIRTY = (1 << 1), /* block will be written with transaction */
BF_SM = (1 << 2), /* small 4k block instead of large 64k block */
BF_OVERWRITE = (1 << 3), /* caller will overwrite contents, don't read */
};
int block_get(struct block **blk_ret, u64 blkno, int bf);
void block_put(struct block **blkp);
void *block_buf(struct block *blk);
size_t block_size(struct block *blk);
void block_drop(struct block **blkp);
void block_readahead(u64 *blknos, size_t nr);
int block_try_commit(bool force);
int block_setup(int meta_fd, size_t max_cached_bytes, size_t max_dirty_bytes);
void block_shutdown(void);
#endif
+209
View File
@@ -0,0 +1,209 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "key.h"
#include "avl.h"
#include "block.h"
#include "btree.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
#include "meta.h"
#include "problem.h"
static inline void *item_val(struct scoutfs_btree_block *bt, struct scoutfs_btree_item *item)
{
return (void *)bt + le16_to_cpu(item->val_off);
}
static void readahead_refs(struct scoutfs_btree_block *bt)
{
struct scoutfs_btree_item *item;
struct scoutfs_avl_node *node;
struct scoutfs_block_ref *ref;
u64 *blknos;
u64 blkno;
u16 valid = 0;
u16 nr = le16_to_cpu(bt->nr_items);
int i;
blknos = calloc(nr, sizeof(blknos[0]));
if (!blknos)
return;
node = avl_first(&bt->item_root);
for (i = 0; i < nr; i++) {
item = container_of(node, struct scoutfs_btree_item, node);
ref = item_val(bt, item);
blkno = le64_to_cpu(ref->blkno);
if (valid_meta_blkno(blkno))
blknos[valid++] = blkno;
node = avl_next(&bt->item_root, &item->node);
}
if (valid > 0)
block_readahead(blknos, valid);
free(blknos);
}
/*
* Call the callback on the referenced block. Then if the block
* contains referneces read it and recurse into all its references.
*/
static int btree_ref_meta_iter(struct scoutfs_block_ref *ref, unsigned level, extent_cb_t cb,
void *cb_arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *node;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
blkno = le64_to_cpu(ref->blkno);
if (!blkno)
return 0;
ret = cb(blkno, 1, cb_arg);
if (ret < 0) {
ret = xlate_iter_errno(ret);
return 0;
}
if (level == 0)
return 0;
ret = block_get(&blk, blkno, 0);
if (ret < 0)
return ret;
sns_push("btree_parent", blkno, 0);
bt = block_buf(blk);
/* XXX integrate verification with block cache */
if (bt->level != level) {
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
ret = -EINVAL;
goto out;
}
/* read-ahead last level of parents */
if (level == 2)
readahead_refs(bt);
node = avl_first(&bt->item_root);
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
item = container_of(node, struct scoutfs_btree_item, node);
ref = item_val(bt, item);
ret = btree_ref_meta_iter(ref, level - 1, cb, cb_arg);
if (ret < 0)
goto out;
node = avl_next(&bt->item_root, &item->node);
}
ret = 0;
out:
block_put(&blk);
sns_pop();
return ret;
}
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg)
{
/* XXX check root */
if (root->height == 0)
return 0;
return btree_ref_meta_iter(&root->ref, root->height - 1, cb, cb_arg);
}
static int btree_ref_item_iter(struct scoutfs_block_ref *ref, unsigned level,
btree_item_cb_t cb, void *cb_arg)
{
struct scoutfs_btree_item *item;
struct scoutfs_btree_block *bt;
struct scoutfs_avl_node *node;
struct block *blk = NULL;
u64 blkno;
int ret;
int i;
blkno = le64_to_cpu(ref->blkno);
if (!blkno)
return 0;
ret = block_get(&blk, blkno, 0);
if (ret < 0)
return ret;
if (level)
sns_push("btree_parent", blkno, 0);
else
sns_push("btree_leaf", blkno, 0);
bt = block_buf(blk);
/* XXX integrate verification with block cache */
if (bt->level != level) {
problem(PB_BTREE_BLOCK_BAD_LEVEL, "expected %u level %u", level, bt->level);
ret = -EINVAL;
goto out;
}
/* read-ahead leaves that contain items */
if (level == 1)
readahead_refs(bt);
node = avl_first(&bt->item_root);
for (i = 0; i < le16_to_cpu(bt->nr_items); i++) {
item = container_of(node, struct scoutfs_btree_item, node);
if (level) {
ref = item_val(bt, item);
ret = btree_ref_item_iter(ref, level - 1, cb, cb_arg);
} else {
ret = cb(&item->key, item_val(bt, item),
le16_to_cpu(item->val_len), cb_arg);
debug("free item key "SK_FMT" ret %d", SK_ARG(&item->key), ret);
}
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
node = avl_next(&bt->item_root, &item->node);
}
ret = 0;
out:
block_put(&blk);
sns_pop();
return ret;
}
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg)
{
/* XXX check root */
if (root->height == 0)
return 0;
return btree_ref_item_iter(&root->ref, root->height - 1, cb, cb_arg);
}
+14
View File
@@ -0,0 +1,14 @@
#ifndef _SCOUTFS_UTILS_CHECK_BTREE_H_
#define _SCOUTFS_UTILS_CHECK_BTREE_H_
#include "util.h"
#include "format.h"
#include "extent.h"
typedef int (*btree_item_cb_t)(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg);
int btree_meta_iter(struct scoutfs_btree_root *root, extent_cb_t cb, void *cb_arg);
int btree_item_iter(struct scoutfs_btree_root *root, btree_item_cb_t cb, void *cb_arg);
#endif
+149
View File
@@ -0,0 +1,149 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <stdbool.h>
#include <argp.h>
#include "sparse.h"
#include "parse.h"
#include "util.h"
#include "format.h"
#include "ioctl.h"
#include "cmd.h"
#include "dev.h"
#include "alloc.h"
#include "block.h"
#include "debug.h"
#include "meta.h"
#include "super.h"
struct check_args {
char *meta_device;
char *data_device;
char *debug_path;
};
static int do_check(struct check_args *args)
{
int debug_fd = -1;
int meta_fd = -1;
int data_fd = -1;
int ret;
if (args->debug_path) {
debug_fd = open(args->debug_path, O_WRONLY | O_CREAT | O_TRUNC, 0644);
if (debug_fd < 0) {
ret = -errno;
fprintf(stderr, "error opening debug output file '%s': %s (%d)\n",
args->debug_path, strerror(errno), errno);
goto out;
}
debug_enable(debug_fd);
}
meta_fd = open(args->meta_device, O_DIRECT | O_RDWR | O_EXCL);
if (meta_fd < 0) {
ret = -errno;
fprintf(stderr, "failed to open meta device '%s': %s (%d)\n",
args->meta_device, strerror(errno), errno);
goto out;
}
data_fd = open(args->data_device, O_DIRECT | O_RDWR | O_EXCL);
if (data_fd < 0) {
ret = -errno;
fprintf(stderr, "failed to open data device '%s': %s (%d)\n",
args->data_device, strerror(errno), errno);
goto out;
}
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024);
if (ret < 0)
goto out;
ret = check_supers() ?:
check_meta_alloc();
out:
/* and tear it all down */
block_shutdown();
super_shutdown();
debug_disable();
if (meta_fd >= 0)
close(meta_fd);
if (data_fd >= 0)
close(data_fd);
if (debug_fd >= 0)
close(debug_fd);
return ret;
}
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct check_args *args = state->input;
switch (key) {
case 'd':
args->debug_path = strdup_or_error(state, arg);
break;
case 'e':
case ARGP_KEY_ARG:
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
else if (!args->data_device)
args->data_device = strdup_or_error(state, arg);
else
argp_error(state, "more than two device arguments given");
break;
case ARGP_KEY_FINI:
if (!args->meta_device)
argp_error(state, "no metadata device argument given");
if (!args->data_device)
argp_error(state, "no data device argument given");
break;
default:
break;
}
return 0;
}
static struct argp_option options[] = {
{ "debug", 'd', "FILE_PATH", 0, "Path to debug output file, will be created or truncated"},
{ NULL }
};
static struct argp argp = {
options,
parse_opt,
"META-DEVICE DATA-DEVICE",
"Check filesystem consistency"
};
static int check_cmd(int argc, char **argv)
{
struct check_args check_args = {NULL};
int ret;
ret = argp_parse(&argp, argc, argv, 0, NULL, &check_args);
if (ret)
return ret;
return do_check(&check_args);
}
static void __attribute__((constructor)) check_ctor(void)
{
cmd_register_argp("check", &argp, GROUP_CORE, check_cmd);
}
+16
View File
@@ -0,0 +1,16 @@
#include <stdlib.h>
#include "debug.h"
int debug_fd = -1;
void debug_enable(int fd)
{
debug_fd = fd;
}
void debug_disable(void)
{
if (debug_fd >= 0)
debug_fd = -1;
}
+17
View File
@@ -0,0 +1,17 @@
#ifndef _SCOUTFS_UTILS_CHECK_DEBUG_H_
#define _SCOUTFS_UTILS_CHECK_DEBUG_H_
#include <stdio.h>
#define debug(fmt, args...) \
do { \
if (debug_fd >= 0) \
dprintf(debug_fd, fmt"\n", ##args); \
} while (0)
extern int debug_fd;
void debug_enable(int fd);
void debug_disable(void);
#endif
+9
View File
@@ -0,0 +1,9 @@
#ifndef _SCOUTFS_UTILS_CHECK_ENO_H_
#define _SCOUTFS_UTILS_CHECK_ENO_H_
#include <errno.h>
#define ENO_FMT "%d (%s)"
#define ENO_ARG(eno) eno, strerror(eno)
#endif
+312
View File
@@ -0,0 +1,312 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <errno.h>
#include "util.h"
#include "lk_rbtree_wrapper.h"
#include "debug.h"
#include "extent.h"
/*
* In-memory extent management in rbtree nodes.
*/
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len)
{
u64 a_end = a_start + a_len;
u64 b_end = b_start + b_len;
return !((a_end <= b_start) || (b_end <= a_start));
}
static int ext_contains(struct extent_node *ext, u64 start, u64 len)
{
return ext->start <= start && ext->start + ext->len >= start + len;
}
/*
* True if the given extent is bisected by the given range; there's
* leftover containing extents on both the left and right sides of the
* range in the extent.
*/
static int ext_bisected(struct extent_node *ext, u64 start, u64 len)
{
return ext->start < start && ext->start + ext->len > start + len;
}
static struct extent_node *ext_from_rbnode(struct rb_node *rbnode)
{
return rbnode ? container_of(rbnode, struct extent_node, rbnode) : NULL;
}
static struct extent_node *next_ext(struct extent_node *ext)
{
return ext ? ext_from_rbnode(rb_next(&ext->rbnode)) : NULL;
}
static struct extent_node *prev_ext(struct extent_node *ext)
{
return ext ? ext_from_rbnode(rb_prev(&ext->rbnode)) : NULL;
}
struct walk_results {
unsigned bisect_to_leaf:1;
struct extent_node *found;
struct extent_node *next;
struct rb_node *parent;
struct rb_node **node;
};
static void walk_extents(struct extent_root *root, u64 start, u64 len, struct walk_results *wlk)
{
struct rb_node **node = &root->rbroot.rb_node;
struct extent_node *ext;
u64 end = start + len;
int cmp;
wlk->found = NULL;
wlk->next = NULL;
wlk->parent = NULL;
while (*node) {
wlk->parent = *node;
ext = ext_from_rbnode(*node);
cmp = end <= ext->start ? -1 :
start >= ext->start + ext->len ? 1 : 0;
if (cmp < 0) {
node = &ext->rbnode.rb_left;
wlk->next = ext;
} else if (cmp > 0) {
node = &ext->rbnode.rb_right;
} else {
wlk->found = ext;
if (!(wlk->bisect_to_leaf && ext_bisected(ext, start, len)))
break;
/* walk right so we can insert greater right from bisection */
node = &ext->rbnode.rb_right;
}
}
wlk->node = node;
}
/*
* Return an extent that overlaps with the given range.
*/
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found)
{
struct walk_results wlk = { 0, };
int ret;
walk_extents(root, start, len, &wlk);
if (wlk.found) {
memset(found, 0, sizeof(struct extent_node));
found->start = wlk.found->start;
found->len = wlk.found->len;
ret = 0;
} else {
ret = -ENOENT;
}
return ret;
}
/*
* Callers can iterate through direct node references and are entirely
* responsible for consistency when doing so.
*/
struct extent_node *extent_first(struct extent_root *root)
{
struct walk_results wlk = { 0, };
walk_extents(root, 0, 1, &wlk);
return wlk.found ?: wlk.next;
}
struct extent_node *extent_next(struct extent_node *ext)
{
return next_ext(ext);
}
struct extent_node *extent_prev(struct extent_node *ext)
{
return prev_ext(ext);
}
/*
* Insert a new extent into the tree. We can extend existing nodes,
* merge with neighbours, or remove existing extents entirely if we
* insert a range that fully spans existing nodes.
*/
static int walk_insert(struct extent_root *root, u64 start, u64 len, int found_err)
{
struct walk_results wlk = { 0, };
struct extent_node *ext;
struct extent_node *nei;
int ret;
walk_extents(root, start, len, &wlk);
ext = wlk.found;
if (ext && found_err) {
ret = found_err;
goto out;
}
if (!ext) {
ext = malloc(sizeof(struct extent_node));
if (!ext) {
ret = -ENOMEM;
goto out;
}
ext->start = start;
ext->len = len;
rb_link_node(&ext->rbnode, wlk.parent, wlk.node);
rb_insert_color(&ext->rbnode, &root->rbroot);
}
/* start by expanding an existing extent if our range is larger */
if (start < ext->start) {
ext->len += ext->start - start;
ext->start = start;
}
if (ext->start + ext->len < start + len)
ext->len += (start + len) - (ext->start + ext->len);
/* drop any fully spanned neighbors, possibly merging with a final adjacent one */
while ((nei = prev_ext(ext))) {
if (nei->start + nei->len < ext->start)
break;
if (nei->start < ext->start) {
ext->len += ext->start - nei->start;
ext->start = nei->start;
}
rb_erase(&nei->rbnode, &root->rbroot);
free(nei);
}
while ((nei = next_ext(ext))) {
if (ext->start + ext->len < nei->start)
break;
if (ext->start + ext->len < nei->start + nei->len)
ext->len += (nei->start + nei->len) - (ext->start + ext->len);
rb_erase(&nei->rbnode, &root->rbroot);
free(nei);
}
ret = 0;
out:
debug("start %llu len %llu ret %d", start, len, ret);
return ret;
}
/*
* Insert a new extent. The specified extent must not overlap with any
* existing extents or -EEXIST is returned.
*/
int extent_insert_new(struct extent_root *root, u64 start, u64 len)
{
return walk_insert(root, start, len, true);
}
/*
* Insert an extent, extending any existing extents that may overlap.
*/
int extent_insert_extend(struct extent_root *root, u64 start, u64 len)
{
return walk_insert(root, start, len, false);
}
/*
* Remove the specified extent from an existing node. The given extent must be fully
* contained in a single node or -ENOENT is returned.
*/
int extent_remove(struct extent_root *root, u64 start, u64 len)
{
struct extent_node *ext;
struct extent_node *ins;
struct walk_results wlk = {
.bisect_to_leaf = 1,
};
int ret;
walk_extents(root, start, len, &wlk);
if (!(ext = wlk.found) || !ext_contains(ext, start, len)) {
ret = -ENOENT;
goto out;
}
if (ext_bisected(ext, start, len)) {
debug("found bisected start %llu len %llu", ext->start, ext->len);
ins = malloc(sizeof(struct extent_node));
if (!ins) {
ret = -ENOMEM;
goto out;
}
ins->start = start + len;
ins->len = (ext->start + ext->len) - ins->start;
rb_link_node(&ins->rbnode, wlk.parent, wlk.node);
rb_insert_color(&ins->rbnode, &root->rbroot);
}
if (start > ext->start) {
ext->len = start - ext->start;
} else if (len < ext->len) {
ext->start += len;
ext->len -= len;
} else {
rb_erase(&ext->rbnode, &root->rbroot);
}
ret = 0;
out:
debug("start %llu len %llu ret %d", start, len, ret);
return ret;
}
void extent_root_init(struct extent_root *root)
{
root->rbroot = RB_ROOT;
root->total = 0;
}
void extent_root_free(struct extent_root *root)
{
struct extent_node *ext;
struct rb_node *node;
struct rb_node *tmp;
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
ext = rb_entry(node, struct extent_node, rbnode);
rb_erase(&ext->rbnode, &root->rbroot);
free(ext);
}
}
void extent_root_print(struct extent_root *root)
{
struct extent_node *ext;
struct rb_node *node;
struct rb_node *tmp;
for (node = rb_first(&root->rbroot); node && ((tmp = rb_next(node)), 1); node = tmp) {
ext = rb_entry(node, struct extent_node, rbnode);
debug(" start %llu len %llu", ext->start, ext->len);
}
}
+38
View File
@@ -0,0 +1,38 @@
#ifndef _SCOUTFS_UTILS_CHECK_EXTENT_H_
#define _SCOUTFS_UTILS_CHECK_EXTENT_H_
#include "lk_rbtree_wrapper.h"
struct extent_root {
struct rb_root rbroot;
u64 total;
};
struct extent_node {
struct rb_node rbnode;
u64 start;
u64 len;
};
typedef int (*extent_cb_t)(u64 start, u64 len, void *arg);
struct extent_cb_arg_t {
extent_cb_t cb;
void *cb_arg;
};
bool extents_overlap(u64 a_start, u64 a_len, u64 b_start, u64 b_len);
int extent_lookup(struct extent_root *root, u64 start, u64 len, struct extent_node *found);
struct extent_node *extent_first(struct extent_root *root);
struct extent_node *extent_next(struct extent_node *ext);
struct extent_node *extent_prev(struct extent_node *ext);
int extent_insert_new(struct extent_root *root, u64 start, u64 len);
int extent_insert_extend(struct extent_root *root, u64 start, u64 len);
int extent_remove(struct extent_root *root, u64 start, u64 len);
void extent_root_init(struct extent_root *root);
void extent_root_free(struct extent_root *root);
void extent_root_print(struct extent_root *root);
#endif
+540
View File
@@ -0,0 +1,540 @@
#define _GNU_SOURCE /* O_DIRECT */
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <stdbool.h>
#include <argp.h>
#include "sparse.h"
#include "bitmap.h"
#include "parse.h"
#include "util.h"
#include "format.h"
#include "crc.h"
#include "cmd.h"
#include "dev.h"
#include "alloc.h"
#include "block.h"
#include "btree.h"
#include "log_trees.h"
#include "super.h"
/* huh. */
#define OFF_MAX (off_t)((u64)((off_t)~0ULL) >> 1)
#define SCOUTFS_META_IMAGE_HEADER_MAGIC 0x8aee00d098fa60c5ULL
#define SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC 0x70bd5e9269effd86ULL
struct scoutfs_meta_image_header {
__le64 magic;
__le64 total_bytes;
__le32 version;
} __packed;
struct scoutfs_meta_image_block_header {
__le64 magic;
__le64 offset;
__le32 size;
__le32 crc;
} __packed;
struct image_args {
char *meta_device;
bool is_read;
bool show_header;
u64 ra_window;
};
struct block_bitmaps {
unsigned long *bits;
u64 size;
u64 count;
};
#define errf(fmt, args...) \
dprintf(STDERR_FILENO, fmt, ##args)
static int set_meta_bit(u64 start, u64 len, void *arg)
{
struct block_bitmaps *bm = arg;
int ret;
if (len != 1) {
ret = -EINVAL;
} else {
if (!test_bit(bm->bits, start)) {
set_bit(bm->bits, start);
bm->count++;
}
ret = 0;
}
return ret;
}
static int get_ref_bits(struct block_bitmaps *bm)
{
struct scoutfs_super_block *super = global_super;
int ret;
u64 i;
/*
* There are almost no small blocks we need to read, so we read
* them as the large blocks that contain them to simplify the
* block reading process.
*/
set_meta_bit(SCOUTFS_SUPER_BLKNO >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++)
set_meta_bit((SCOUTFS_QUORUM_BLKNO + i) >> SCOUTFS_BLOCK_SM_LG_SHIFT, 1, bm);
ret = alloc_root_meta_iter(&super->meta_alloc[0], set_meta_bit, bm) ?:
alloc_root_meta_iter(&super->meta_alloc[1], set_meta_bit, bm) ?:
alloc_root_meta_iter(&super->data_alloc, set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_avail[0], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_avail[1], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_freed[0], set_meta_bit, bm) ?:
alloc_list_meta_iter(&super->server_meta_freed[1], set_meta_bit, bm) ?:
btree_meta_iter(&super->fs_root, set_meta_bit, bm) ?:
btree_meta_iter(&super->logs_root, set_meta_bit, bm) ?:
btree_meta_iter(&super->log_merge, set_meta_bit, bm) ?:
btree_meta_iter(&super->mounted_clients, set_meta_bit, bm) ?:
btree_meta_iter(&super->srch_root, set_meta_bit, bm) ?:
log_trees_meta_iter(set_meta_bit, bm);
return ret;
}
/*
* Note that this temporarily modifies the header that it's given.
*/
static __le32 calc_crc(struct scoutfs_meta_image_block_header *bh, void *buf, size_t size)
{
__le32 saved = bh->crc;
u32 crc = ~0;
bh->crc = 0;
crc = crc32c(crc, bh, sizeof(*bh));
crc = crc32c(crc, buf, size);
bh->crc = saved;
return cpu_to_le32(crc);
}
static void printf_header(struct scoutfs_meta_image_header *hdr)
{
errf("magic: 0x%016llx\n"
"total_bytes: %llu\n"
"version: %u\n",
le64_to_cpu(hdr->magic),
le64_to_cpu(hdr->total_bytes),
le32_to_cpu(hdr->version));
}
typedef ssize_t (*rw_func_t)(int fd, void *buf, size_t count, off_t offset);
static inline ssize_t rw_read(int fd, void *buf, size_t count, off_t offset)
{
return read(fd, buf, count);
}
static inline ssize_t rw_pread(int fd, void *buf, size_t count, off_t offset)
{
return pread(fd, buf, count, offset);
}
static inline ssize_t rw_write(int fd, void *buf, size_t count, off_t offset)
{
return write(fd, buf, count);
}
static inline ssize_t rw_pwrite(int fd, void *buf, size_t count, off_t offset)
{
return pwrite(fd, buf, count, offset);
}
static int rw_full_count(rw_func_t func, u64 *tot, int fd, void *buf, size_t count, off_t offset)
{
ssize_t sret;
while (count > 0) {
sret = func(fd, buf, count, offset);
if (sret <= 0 || sret > count) {
if (sret < 0)
return -errno;
else
return -EIO;
}
if (tot)
*tot += sret;
buf += sret;
count -= sret;
}
return 0;
}
static int read_image(struct image_args *args, int fd, struct block_bitmaps *bm)
{
struct scoutfs_meta_image_block_header bh;
struct scoutfs_meta_image_header hdr;
u64 opening;
void *buf;
off_t off;
u64 bit;
u64 ra;
int ret;
buf = malloc(SCOUTFS_BLOCK_LG_SIZE);
if (!buf) {
ret = -ENOMEM;
goto out;
}
hdr.magic = cpu_to_le64(SCOUTFS_META_IMAGE_HEADER_MAGIC);
hdr.total_bytes = cpu_to_le64(sizeof(hdr) +
(bm->count * (SCOUTFS_BLOCK_LG_SIZE + sizeof(bh))));
hdr.version = cpu_to_le32(1);
if (args->show_header) {
printf_header(&hdr);
ret = 0;
goto out;
}
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &hdr, sizeof(hdr), 0);
if (ret < 0)
goto out;
opening = args->ra_window;
ra = 0;
bit = 0;
for (bit = 0; (bit = find_next_set_bit(bm->bits, bit, bm->size)) < bm->size; bit++) {
/* readahead to open the full window, then a block at a time */
do {
ra = find_next_set_bit(bm->bits, ra, bm->size);
if (ra < bm->size) {
off = ra << SCOUTFS_BLOCK_LG_SHIFT;
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_WILLNEED);
ra++;
if (opening)
opening -= min(opening, SCOUTFS_BLOCK_LG_SIZE);
}
} while (opening > 0);
off = bit << SCOUTFS_BLOCK_LG_SHIFT;
ret = rw_full_count(rw_pread, NULL, fd, buf, SCOUTFS_BLOCK_LG_SIZE, off);
if (ret < 0)
goto out;
/*
* Might as well try to drop the pages we've used to
* reduce memory pressure on our read-ahead pages that
* are waiting.
*/
posix_fadvise(fd, off, SCOUTFS_BLOCK_LG_SIZE, POSIX_FADV_DONTNEED);
bh.magic = SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC;
bh.offset = cpu_to_le64(off);
bh.size = cpu_to_le32(SCOUTFS_BLOCK_LG_SIZE);
bh.crc = calc_crc(&bh, buf, SCOUTFS_BLOCK_LG_SIZE);
ret = rw_full_count(rw_write, NULL, STDOUT_FILENO, &bh, sizeof(bh), 0) ?:
rw_full_count(rw_write, NULL, STDOUT_FILENO, buf, SCOUTFS_BLOCK_LG_SIZE, 0);
if (ret < 0)
goto out;
}
out:
free(buf);
return ret;
}
static int invalid_header(struct scoutfs_meta_image_header *hdr)
{
if (le64_to_cpu(hdr->magic) != SCOUTFS_META_IMAGE_HEADER_MAGIC) {
errf("bad image header magic 0x%016llx (!= expected %016llx)\n",
le64_to_cpu(hdr->magic), SCOUTFS_META_IMAGE_HEADER_MAGIC);
} else if (le32_to_cpu(hdr->version) != 1) {
errf("unknown image header version %u\n", le32_to_cpu(hdr->version));
} else {
return 0;
}
return -EIO;
}
/*
* Doesn't catch offset+size overflowing, presumes pwrite() will return
* an error.
*/
static int invalid_block_header(struct scoutfs_meta_image_block_header *bh)
{
if (le64_to_cpu(bh->magic) != SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC) {
errf("bad block header magic 0x%016llx (!= expected %016llx)\n",
le64_to_cpu(bh->magic), SCOUTFS_META_IMAGE_BLOCK_HEADER_MAGIC);
} else if (le32_to_cpu(bh->size) == 0) {
errf("invalid block header size %u\n", le32_to_cpu(bh->size));
} else if (le32_to_cpu(bh->size) > SIZE_MAX) {
errf("block header size %u too large for size_t (> %zu)\n",
le32_to_cpu(bh->size), (size_t)SIZE_MAX);
} else if (le64_to_cpu(bh->offset) > OFF_MAX) {
errf("block header offset %llu too large for off_t (> %llu)\n",
le64_to_cpu(bh->offset), (u64)OFF_MAX);
} else {
return 0;
}
return -EIO;
}
static int write_image(struct image_args *args, int fd, struct block_bitmaps *bm)
{
struct scoutfs_meta_image_block_header bh;
struct scoutfs_meta_image_header hdr;
size_t writeback_batch = (2 * 1024 * 1024);
size_t buf_size;
size_t dirty;
size_t size;
off_t first;
off_t last;
off_t off;
__le32 calc;
void *buf;
u64 tot;
int ret;
tot = 0;
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &hdr, sizeof(hdr), 0);
if (ret < 0)
goto out;
if (args->show_header) {
printf_header(&hdr);
ret = 0;
goto out;
}
ret = invalid_header(&hdr);
if (ret < 0)
goto out;
dirty = 0;
first = OFF_MAX;
last = 0;
buf = NULL;
buf_size = 0;
while (tot < le64_to_cpu(hdr.total_bytes)) {
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, &bh, sizeof(bh), 0);
if (ret < 0)
goto out;
ret = invalid_block_header(&bh);
if (ret < 0)
goto out;
size = le32_to_cpu(bh.size);
if (buf_size < size) {
buf = realloc(buf, size);
if (!buf) {
ret = -ENOMEM;
goto out;
}
buf_size = size;
}
ret = rw_full_count(rw_read, &tot, STDIN_FILENO, buf, size, 0);
if (ret < 0)
goto out;
calc = calc_crc(&bh, buf, size);
if (calc != bh.crc) {
errf("crc err");
ret = -EIO;
goto out;
}
off = le64_to_cpu(bh.offset);
ret = rw_full_count(rw_pwrite, NULL, fd, buf, size, off);
if (ret < 0)
goto out;
dirty += size;
first = min(first, off);
last = max(last, off);
if (dirty >= writeback_batch) {
posix_fadvise(fd, first, last, POSIX_FADV_DONTNEED);
dirty = 0;
first = OFF_MAX;
last = 0;
}
}
ret = fsync(fd);
if (ret < 0) {
ret = -errno;
goto out;
}
out:
return ret;
}
static int do_image(struct image_args *args)
{
struct block_bitmaps bm = { .bits = NULL };
int meta_fd = -1;
u64 dev_size;
mode_t mode;
int ret;
mode = args->is_read ? O_RDONLY : O_RDWR;
meta_fd = open(args->meta_device, mode);
if (meta_fd < 0) {
ret = -errno;
errf("failed to open meta device '%s': %s (%d)\n",
args->meta_device, strerror(errno), errno);
goto out;
}
if (args->is_read) {
ret = flush_device(meta_fd);
if (ret < 0)
goto out;
ret = get_device_size(args->meta_device, meta_fd, &dev_size);
if (ret < 0)
goto out;
bm.size = DIV_ROUND_UP(dev_size, SCOUTFS_BLOCK_LG_SIZE);
bm.bits = calloc(1, round_up(bm.size, BITS_PER_LONG) / 8);
if (!bm.bits) {
ret = -ENOMEM;
goto out;
}
ret = block_setup(meta_fd, 128 * 1024 * 1024, 32 * 1024 * 1024) ?:
check_supers() ?:
get_ref_bits(&bm) ?:
read_image(args, meta_fd, &bm);
block_shutdown();
} else {
ret = write_image(args, meta_fd, &bm);
}
out:
free(bm.bits);
if (meta_fd >= 0)
close(meta_fd);
return ret;
}
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct image_args *args = state->input;
int ret;
switch (key) {
case 'h':
args->show_header = true;
break;
case 'r':
ret = parse_u64(arg, &args->ra_window);
if (ret)
argp_error(state, "readahead winddoe parse error");
break;
case ARGP_KEY_ARG:
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
else
argp_error(state, "more than two device arguments given");
break;
case ARGP_KEY_FINI:
if (!args->meta_device)
argp_error(state, "no metadata device argument given");
break;
default:
break;
}
return 0;
}
static struct argp_option options[] = {
{ "show-header", 'h', NULL, 0, "Print image header and exit without processing stream" },
{ "readahead", 'r', "NR", 0, "Maintain read-ahead window of NR blocks" },
{ NULL }
};
static struct argp read_image_argp = {
options,
parse_opt,
"META-DEVICE",
"Read metadata image stream from metadata device file"
};
#define DEFAULT_RA_WINDOW (512 * 1024)
static int read_image_cmd(int argc, char **argv)
{
struct image_args image_args = {
.is_read = true,
.ra_window = DEFAULT_RA_WINDOW,
};
int ret;
ret = argp_parse(&read_image_argp, argc, argv, 0, NULL, &image_args);
if (ret)
return ret;
return do_image(&image_args);
}
static struct argp write_image_argp = {
options,
parse_opt,
"META-DEVICE",
"Write metadata image stream to metadata device file"
};
static int write_image_cmd(int argc, char **argv)
{
struct image_args image_args = {
.is_read = false,
.ra_window = DEFAULT_RA_WINDOW,
};
int ret;
ret = argp_parse(&write_image_argp, argc, argv, 0, NULL, &image_args);
if (ret)
return ret;
return do_image(&image_args);
}
static void __attribute__((constructor)) image_ctor(void)
{
cmd_register_argp("read-metadata-image", &read_image_argp, GROUP_CORE, read_image_cmd);
cmd_register_argp("write-metadata-image", &write_image_argp, GROUP_CORE, write_image_cmd);
}
+15
View File
@@ -0,0 +1,15 @@
#ifndef _SCOUTFS_UTILS_CHECK_ITER_H_
#define _SCOUTFS_UTILS_CHECK_ITER_H_
/*
* Callbacks can return a weird -errno that we'll never use to indicate
* that iteration can stop and return 0 for success.
*/
#define ECHECK_ITER_DONE EL2HLT
static inline int xlate_iter_errno(int ret)
{
return ret == -ECHECK_ITER_DONE ? 0 : ret;
}
#endif
+98
View File
@@ -0,0 +1,98 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "key.h"
#include "alloc.h"
#include "btree.h"
#include "debug.h"
#include "extent.h"
#include "iter.h"
#include "sns.h"
#include "log_trees.h"
#include "super.h"
struct iter_args {
extent_cb_t cb;
void *cb_arg;
};
static int lt_meta_iter(struct scoutfs_key *key, void *val, u16 val_len, void *cb_arg)
{
struct iter_args *ia = cb_arg;
struct scoutfs_log_trees *lt;
int ret;
if (val_len != sizeof(struct scoutfs_log_trees))
; /* XXX */
lt = val;
sns_push("log_trees", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
debug("lt rid 0x%16llx nr %llu", le64_to_cpu(lt->rid), le64_to_cpu(lt->nr));
sns_push("meta_avail", 0, 0);
ret = alloc_list_meta_iter(&lt->meta_avail, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_freed", 0, 0);
ret = alloc_list_meta_iter(&lt->meta_freed, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("item_root", 0, 0);
ret = btree_meta_iter(&lt->item_root, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
if (lt->bloom_ref.blkno) {
sns_push("bloom_ref", 0, 0);
ret = ia->cb(le64_to_cpu(lt->bloom_ref.blkno), 1, ia->cb_arg);
sns_pop();
if (ret < 0) {
ret = xlate_iter_errno(ret);
goto out;
}
}
sns_push("data_avail", 0, 0);
ret = alloc_root_meta_iter(&lt->data_avail, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
sns_push("data_freed", 0, 0);
ret = alloc_root_meta_iter(&lt->data_freed, ia->cb, ia->cb_arg);
sns_pop();
if (ret < 0)
goto out;
ret = 0;
out:
sns_pop();
return ret;
}
/*
* Call the callers callback with the extent of all the metadata block references contained
* in log btrees. We walk the logs_root btree items and walk all the metadata structures
* they reference.
*/
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg)
{
struct scoutfs_super_block *super = global_super;
struct iter_args ia = { .cb = cb, .cb_arg = cb_arg };
return btree_item_iter(&super->logs_root, lt_meta_iter, &ia);
}
+8
View File
@@ -0,0 +1,8 @@
#ifndef _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
#define _SCOUTFS_UTILS_CHECK_LOG_TREES_H_
#include "extent.h"
int log_trees_meta_iter(extent_cb_t cb, void *cb_arg);
#endif
+367
View File
@@ -0,0 +1,367 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "bitmap.h"
#include "key.h"
#include "alloc.h"
#include "btree.h"
#include "debug.h"
#include "extent.h"
#include "sns.h"
#include "log_trees.h"
#include "meta.h"
#include "problem.h"
#include "super.h"
static struct meta_data {
struct extent_root meta_refed;
struct extent_root meta_free;
struct {
u64 ref_blocks;
u64 free_extents;
u64 free_blocks;
} stats;
} global_mdat;
bool valid_meta_blkno(u64 blkno)
{
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
return blkno >= SCOUTFS_META_DEV_START_BLKNO && blkno < tot;
}
static bool valid_meta_extent(u64 start, u64 len)
{
u64 tot = le64_to_cpu(global_super->total_meta_blocks);
bool valid;
valid = len > 0 &&
start >= SCOUTFS_META_DEV_START_BLKNO &&
start < tot &&
len <= tot &&
((start + len) <= tot) &&
((start + len) > start);
debug("start %llu len %llu valid %u", start, len, !!valid);
if (!valid)
problem(PB_META_EXTENT_INVALID, "start %llu len %llu", start, len);
return valid;
}
/*
* Track references to individual metadata blocks. This uses the extent
* callback type but is only ever called for single block references.
* Any reference to a block that has already been referenced is
* considered invalid and is ignored. Later repair will resolve
* duplicate references.
*/
static int insert_meta_ref(u64 start, u64 len, void *arg)
{
struct meta_data *mdat = &global_mdat;
struct extent_root *root = arg;
int ret = 0;
/* this is tracking single metadata block references */
if (len != 1) {
ret = -EINVAL;
goto out;
}
if (valid_meta_blkno(start)) {
ret = extent_insert_new(root, start, len);
if (ret == 0)
mdat->stats.ref_blocks++;
else if (ret == -EEXIST)
problem(PB_META_REF_OVERLAPS_EXISTING, "blkno %llu", start);
}
out:
return ret;
}
static int insert_meta_free(u64 start, u64 len, void *arg)
{
struct meta_data *mdat = &global_mdat;
struct extent_root *root = arg;
int ret = 0;
if (valid_meta_extent(start, len)) {
ret = extent_insert_new(root, start, len);
if (ret == 0) {
mdat->stats.free_extents++;
mdat->stats.free_blocks++;
} else if (ret == -EEXIST) {
problem(PB_META_FREE_OVERLAPS_EXISTING,
"start %llu llen %llu", start, len);
}
}
return ret;
}
/*
* Walk all metadata references in the system. This walk doesn't need
* to read metadata that doesn't contain any metadata references so it
* can skip the bulk of metadata blocks. This gives us the set of
* referenced metadata blocks which we can then use to repair metadata
* allocator structures.
*/
static int get_meta_refs(void)
{
struct meta_data *mdat = &global_mdat;
struct scoutfs_super_block *super = global_super;
int ret;
extent_root_init(&mdat->meta_refed);
/* XXX record reserved blocks around super as referenced */
sns_push("meta_alloc", 0, 0);
ret = alloc_root_meta_iter(&super->meta_alloc[0], insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_alloc", 1, 0);
ret = alloc_root_meta_iter(&super->meta_alloc[1], insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("data_alloc", 1, 0);
ret = alloc_root_meta_iter(&super->data_alloc, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 0, 0);
ret = alloc_list_meta_iter(&super->server_meta_avail[0],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 1, 0);
ret = alloc_list_meta_iter(&super->server_meta_avail[1],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 0, 0);
ret = alloc_list_meta_iter(&super->server_meta_freed[0],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 1, 0);
ret = alloc_list_meta_iter(&super->server_meta_freed[1],
insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("fs_root", 0, 0);
ret = btree_meta_iter(&super->fs_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("logs_root", 0, 0);
ret = btree_meta_iter(&super->logs_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("log_merge", 0, 0);
ret = btree_meta_iter(&super->log_merge, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("mounted_clients", 0, 0);
ret = btree_meta_iter(&super->mounted_clients, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
sns_push("srch_root", 0, 0);
ret = btree_meta_iter(&super->srch_root, insert_meta_ref, &mdat->meta_refed);
sns_pop();
if (ret < 0)
goto out;
ret = log_trees_meta_iter(insert_meta_ref, &mdat->meta_refed);
if (ret < 0)
goto out;
printf("found %llu referenced metadata blocks\n", mdat->stats.ref_blocks);
ret = 0;
out:
return ret;
}
static int get_meta_free(void)
{
struct meta_data *mdat = &global_mdat;
struct scoutfs_super_block *super = global_super;
int ret;
extent_root_init(&mdat->meta_free);
sns_push("meta_alloc", 0, 0);
ret = alloc_root_extent_iter(&super->meta_alloc[0], insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("meta_alloc", 1, 0);
ret = alloc_root_extent_iter(&super->meta_alloc[1], insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 0, 0);
ret = alloc_list_extent_iter(&super->server_meta_avail[0],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_avail", 1, 0);
ret = alloc_list_extent_iter(&super->server_meta_avail[1],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 0, 0);
ret = alloc_list_extent_iter(&super->server_meta_freed[0],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
sns_push("server_meta_freed", 1, 0);
ret = alloc_list_extent_iter(&super->server_meta_freed[1],
insert_meta_free, &mdat->meta_free);
sns_pop();
if (ret < 0)
goto out;
printf("found %llu free metadata blocks in %llu extents\n",
mdat->stats.free_blocks, mdat->stats.free_extents);
ret = 0;
out:
return ret;
}
/*
* All the space between referenced blocks must be recorded in the free
* extents. The free extent walk didn't check that the extents
* overlapped with references, we do that here. Remember that metadata
* block references were merged into extents here, the refed extents
* aren't necessarily all a single block.
*/
static int compare_refs_and_free(void)
{
struct meta_data *mdat = &global_mdat;
struct extent_node *ref;
struct extent_node *free;
struct extent_node *next;
struct extent_node *prev;
u64 expect;
u64 start;
u64 end;
expect = 0;
ref = extent_first(&mdat->meta_refed);
free = extent_first(&mdat->meta_free);
while (ref || free) {
printf("exp %llu ref %llu.%llu free %llu.%llu\n",
expect, ref ? ref->start : 0, ref ? ref->len : 0,
free ? free->start : 0, free ? free->len : 0);
/* referenced marked free, remove ref from free and continue from same point */
if (ref && free && extents_overlap(ref->start, ref->len, free->start, free->len)) {
printf("ref extent %llu.%llu overlaps free %llu %llu\n",
ref->start, ref->len, free->start, free->len);
start = max(ref->start, free->start);
end = min(ref->start + ref->len, free->start + free->len);
prev = extent_prev(free);
extent_remove(&mdat->meta_free, start, end - start);
if (prev)
free = extent_next(prev);
else
free = extent_first(&mdat->meta_free);
continue;
}
/* see which extent starts earlier */
if (!free || (ref && ref->start <= free->start))
next = ref;
else
next = free;
/* untracked region before next extent */
if (expect < next->start) {
printf("missing free extent %llu.%llu\n", expect, next->start - expect);
expect = next->start;
continue;
}
/* didn't overlap, advance past next extent */
expect = next->start + next->len;
if (next == ref)
ref = extent_next(ref);
else
free = extent_next(free);
}
return 0;
}
/*
* Check the metadata allocators by comparing the set of referenced
* blocks with the set of free blocks that are stored in free btree
* items and alloc list blocks.
*/
int check_meta_alloc(void)
{
int ret;
ret = get_meta_refs();
if (ret < 0)
goto out;
ret = get_meta_free();
if (ret < 0)
goto out;
ret = compare_refs_and_free();
if (ret < 0)
goto out;
ret = 0;
out:
return ret;
}
+9
View File
@@ -0,0 +1,9 @@
#ifndef _SCOUTFS_UTILS_CHECK_META_H_
#define _SCOUTFS_UTILS_CHECK_META_H_
bool valid_meta_blkno(u64 blkno);
int check_meta_alloc(void);
#endif
+23
View File
@@ -0,0 +1,23 @@
#include <string.h>
#include <stdbool.h>
#include "util.h"
#include "padding.h"
bool padding_is_zeros(const void *data, size_t sz)
{
static char zeros[32] = {0,};
const size_t batch = array_size(zeros);
while (sz >= batch) {
if (memcmp(data, zeros, batch))
return false;
data += batch;
sz -= batch;
}
if (sz > 0 && memcmp(data, zeros, sz))
return false;
return true;
}
+6
View File
@@ -0,0 +1,6 @@
#ifndef _SCOUTFS_UTILS_CHECK_PADDING_H_
#define _SCOUTFS_UTILS_CHECK_PADDING_H_
bool padding_is_zeros(const void *data, size_t sz);
#endif
+23
View File
@@ -0,0 +1,23 @@
#include <stdio.h>
#include <stdint.h>
#include "problem.h"
#if 0
#define PROB_STR(pb) [pb] = #pb
static char *prob_strs[] = {
PROB_STR(PB_META_EXTENT_INVALID),
PROB_STR(PB_META_EXTENT_OVERLAPS_EXISTING),
};
#endif
static struct problem_data {
uint64_t counts[PB__NR];
} global_pdat;
void problem_record(prob_t pb)
{
struct problem_data *pdat = &global_pdat;
pdat->counts[pb]++;
}
+23
View File
@@ -0,0 +1,23 @@
#ifndef _SCOUTFS_UTILS_CHECK_PROBLEM_H_
#define _SCOUTFS_UTILS_CHECK_PROBLEM_H_
#include "debug.h"
#include "sns.h"
typedef enum {
PB_META_EXTENT_INVALID,
PB_META_REF_OVERLAPS_EXISTING,
PB_META_FREE_OVERLAPS_EXISTING,
PB_BTREE_BLOCK_BAD_LEVEL,
PB__NR,
} prob_t;
#define problem(pb, fmt, ...) \
do { \
debug("problem found: "#pb": %s: "fmt, sns_str(), __VA_ARGS__); \
problem_record(pb); \
} while (0)
void problem_record(prob_t pb);
#endif
+118
View File
@@ -0,0 +1,118 @@
#include <stdlib.h>
#include <string.h>
#include "sns.h"
/*
* This "str num stack" is used to describe our location in metadata at
* any given time.
*
* As we descend into structures we pop a string on decribing them,
* perhaps with associated numbers. Pushing and popping is very cheap
* and only rarely do we format the stack into a string, as an arbitrary
* example:
* super.fs_root.btree_parent:1231.btree_leaf:3231"
*/
#define SNS_MAX_DEPTH 1000
#define SNS_STR_SIZE (SNS_MAX_DEPTH * (SNS_MAX_STR_LEN + 1 + 16 + 1))
static struct sns_data {
unsigned int depth;
struct sns_entry {
char *str;
size_t len;
u64 a;
u64 b;
} ents[SNS_MAX_DEPTH];
char str[SNS_STR_SIZE];
} global_lsdat;
void _sns_push(char *str, size_t len, u64 a, u64 b)
{
struct sns_data *lsdat = &global_lsdat;
if (lsdat->depth < SNS_MAX_DEPTH) {
lsdat->ents[lsdat->depth++] = (struct sns_entry) {
.str = str,
.len = len,
.a = a,
.b = b,
};
}
}
void sns_pop(void)
{
struct sns_data *lsdat = &global_lsdat;
if (lsdat->depth > 0)
lsdat->depth--;
}
static char *append_str(char *pos, char *str, size_t len)
{
memcpy(pos, str, len);
return pos + len;
}
/*
* This is not called for x = 0 so we don't need to emit an initial 0.
* We could by using do {} while instead of while {}.
*/
static char *append_u64x(char *pos, u64 x)
{
static char hex[] = "0123456789abcdef";
while (x) {
*pos++ = hex[x & 0xf];
x >>= 4;
}
return pos;
}
static char *append_char(char *pos, char c)
{
*(pos++) = c;
return pos;
}
/*
* Return a pointer to a null terminated string that describes the
* current location stack. The string buffer is global.
*/
char *sns_str(void)
{
struct sns_data *lsdat = &global_lsdat;
struct sns_entry *ent;
char *pos;
int i;
pos = lsdat->str;
for (i = 0; i < lsdat->depth; i++) {
ent = &lsdat->ents[i];
if (i)
pos = append_char(pos, '.');
pos = append_str(pos, ent->str, ent->len);
if (ent->a) {
pos = append_char(pos, ':');
pos = append_u64x(pos, ent->a);
}
if (ent->b) {
pos = append_char(pos, ':');
pos = append_u64x(pos, ent->b);
}
}
*pos = '\0';
return lsdat->str;
}
+20
View File
@@ -0,0 +1,20 @@
#ifndef _SCOUTFS_UTILS_CHECK_SNS_H_
#define _SCOUTFS_UTILS_CHECK_SNS_H_
#include <assert.h>
#include "sparse.h"
#define SNS_MAX_STR_LEN 20
#define sns_push(str, a, b) \
do { \
build_assert(sizeof(str) - 1 <= SNS_MAX_STR_LEN); \
_sns_push((str), sizeof(str) - 1, a, b); \
} while (0)
void _sns_push(char *str, size_t len, u64 a, u64 b);
void sns_pop(void);
char *sns_str(void);
#endif
+57
View File
@@ -0,0 +1,57 @@
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "block.h"
#include "super.h"
/*
* After we check the super blocks we provide a global buffer to track
* the current super block. It is referenced to get static information
* about the system and is also modified and written as part of
* transactions.
*/
struct scoutfs_super_block *global_super;
/*
* After checking the supers we save a copy of it in a global buffer that's used by
* other modules to track the current super. It can be modified and written during commits.
*/
int check_supers(void)
{
struct scoutfs_super_block *super = NULL;
struct block *blk = NULL;
int ret;
global_super = malloc(sizeof(struct scoutfs_super_block));
if (!global_super) {
printf("error allocating super block buffer\n");
ret = -ENOMEM;
goto out;
}
ret = block_get(&blk, SCOUTFS_SUPER_BLKNO, BF_SM);
if (ret < 0) {
printf("error reading super block\n");
goto out;
}
super = block_buf(blk);
memcpy(global_super, super, sizeof(struct scoutfs_super_block));
ret = 0;
out:
block_put(&blk);
return ret;
}
void super_shutdown(void)
{
free(global_super);
}
+9
View File
@@ -0,0 +1,9 @@
#ifndef _SCOUTFS_UTILS_CHECK_SUPER_H_
#define _SCOUTFS_UTILS_CHECK_SUPER_H_
extern struct scoutfs_super_block *global_super;
int check_supers(void);
void super_shutdown(void);
#endif
+43
View File
@@ -156,6 +156,16 @@ static inline void list_move_tail(struct list_head *list,
list_add_tail(list, head);
}
/**
* list_is_head - tests whether @list is the list @head
* @list: the entry to test
* @head: the head of the list
*/
static inline int list_is_head(const struct list_head *list, const struct list_head *head)
{
return list == head;
}
/**
* list_empty - tests whether a list is empty
* @head: the list to test.
@@ -242,6 +252,15 @@ static inline void list_splice_init(struct list_head *list,
for (pos = (head)->next, n = pos->next; pos != (head); \
pos = n, n = pos->next)
/**
* list_entry_is_head - test if the entry points to the head of the list
* @pos: the type * to cursor
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*/
#define list_entry_is_head(pos, head, member) \
(&pos->member == (head))
/**
* list_for_each_entry - iterate over list of given type
* @pos: the type * to use as a loop counter.
@@ -307,4 +326,28 @@ static inline void list_splice_init(struct list_head *list,
#define list_next_entry(pos, member) \
list_entry((pos)->member.next, typeof(*(pos)), member)
/**
* list_prev_entry - get the prev element in list
* @pos: the type * to cursor
* @member: the name of the list_head within the struct.
*/
#define list_prev_entry(pos, member) \
list_entry((pos)->member.prev, typeof(*(pos)), member)
/**
* list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
* @pos: the type * to use as a loop cursor.
* @n: another type * to use as temporary storage
* @head: the head for your list.
* @member: the name of the list_head within the struct.
*
* Iterate backwards over list of given type, safe against removal
* of list entry.
*/
#define list_for_each_entry_safe_reverse(pos, n, head, member) \
for (pos = list_last_entry(head, typeof(*pos), member), \
n = list_prev_entry(pos, member); \
!list_entry_is_head(pos, head, member); \
pos = n, n = list_prev_entry(n, member))
#endif
-24
View File
@@ -1,24 +0,0 @@
#include <unistd.h>
#include <sys/stat.h>
#include "sparse.h"
#include "util.h"
#include "format.h"
#include "mode_types.h"
unsigned int mode_to_type(mode_t mode)
{
#define S_SHIFT 12
static unsigned char mode_types[S_IFMT >> S_SHIFT] = {
[S_IFIFO >> S_SHIFT] = SCOUTFS_DT_FIFO,
[S_IFCHR >> S_SHIFT] = SCOUTFS_DT_CHR,
[S_IFDIR >> S_SHIFT] = SCOUTFS_DT_DIR,
[S_IFBLK >> S_SHIFT] = SCOUTFS_DT_BLK,
[S_IFREG >> S_SHIFT] = SCOUTFS_DT_REG,
[S_IFLNK >> S_SHIFT] = SCOUTFS_DT_LNK,
[S_IFSOCK >> S_SHIFT] = SCOUTFS_DT_SOCK,
};
return mode_types[(mode & S_IFMT) >> S_SHIFT];
#undef S_SHIFT
}
-6
View File
@@ -1,6 +0,0 @@
#ifndef _MODE_TYPES_H_
#define _MODE_TYPES_H_
unsigned int mode_to_type(mode_t mode);
#endif
-46
View File
@@ -1,46 +0,0 @@
#ifndef _SCOUTFS_NAME_HASH_H_
#define _SCOUTFS_NAME_HASH_H_
#include "hash.h"
/*
* Test a bit number as though an array of bytes is a large len-bit
* big-endian value. nr 0 is the LSB of the final byte, nr (len - 1) is
* the MSB of the first byte.
*/
static int test_be_bytes_bit(int nr, const char *bytes, int len)
{
return bytes[(len - 1 - nr) >> 3] & (1 << (nr & 7));
}
/*
* Generate a 32bit "fingerprint" of the name by extracting 32 evenly
* distributed bits from the name. The intent is to have the sort order
* of the fingerprints reflect the memcmp() sort order of the names
* while mapping large names down to small fs keys.
*
* Names that are smaller than 32bits are biased towards the high bits
* of the fingerprint so that most significant bits of the fingerprints
* consistently reflect the initial characters of the names.
*/
static inline u32 dirent_name_fingerprint(const char *name, unsigned int name_len)
{
int name_bits = name_len * 8;
int skip = max(name_bits / 32, 1);
u32 fp = 0;
int f;
int n;
for (f = 31, n = name_bits - 1; f >= 0 && n >= 0; f--, n -= skip)
fp |= !!test_be_bytes_bit(n, name, name_bits) << f;
return fp;
}
static inline u64 dirent_name_hash(const char *name, unsigned int name_len)
{
return scoutfs_hash32(name, name_len) |
((u64)dirent_name_fingerprint(name, name_len) << 32);
}
#endif
File diff suppressed because it is too large Load Diff
-103
View File
@@ -1,103 +0,0 @@
#ifndef _SCOUTFS_PARALLEL_RESTORE_H_
#define _SCOUTFS_PARALLEL_RESTORE_H_
#include <errno.h>
struct scoutfs_parallel_restore_progress {
struct scoutfs_btree_root fs_items;
struct scoutfs_btree_root root_items;
struct scoutfs_srch_file sfl;
struct scoutfs_block_ref bloom_ref;
__le64 inode_count;
__le64 max_ino;
};
struct scoutfs_parallel_restore_slice {
__le64 fsid;
__le64 meta_start;
__le64 meta_len;
};
struct scoutfs_parallel_restore_entry {
u64 dir_ino;
u64 pos;
u64 ino;
mode_t mode;
char *name;
unsigned int name_len;
};
struct scoutfs_parallel_restore_xattr {
u64 ino;
u64 pos;
char *name;
unsigned int name_len;
void *value;
unsigned int value_len;
};
struct scoutfs_parallel_restore_inode {
/* all inodes */
u64 ino;
u64 nr_xattrs;
u32 uid;
u32 gid;
u32 mode;
u32 rdev;
struct timespec atime;
struct timespec ctime;
struct timespec mtime;
struct timespec crtime;
/* regular files */
u64 data_version;
u64 size;
bool offline;
/* only used for directories */
u64 nr_subdirs;
u64 total_entry_name_bytes;
/* only used for symlnks */
char *target;
unsigned int target_len; /* not including null terminator */
};
typedef __typeof__(EINVAL) spr_err_t;
struct scoutfs_parallel_restore_writer;
spr_err_t scoutfs_parallel_restore_create_writer(struct scoutfs_parallel_restore_writer **wrip);
void scoutfs_parallel_restore_destroy_writer(struct scoutfs_parallel_restore_writer **wrip);
spr_err_t scoutfs_parallel_restore_init_slices(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slices,
int nr);
spr_err_t scoutfs_parallel_restore_add_slice(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slice);
spr_err_t scoutfs_parallel_restore_get_slice(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_slice *slice);
spr_err_t scoutfs_parallel_restore_add_inode(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_inode *inode);
spr_err_t scoutfs_parallel_restore_add_entry(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_entry *entry);
spr_err_t scoutfs_parallel_restore_add_xattr(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_xattr *xattr);
spr_err_t scoutfs_parallel_restore_get_progress(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_progress *prog);
spr_err_t scoutfs_parallel_restore_add_progress(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_parallel_restore_progress *prog);
spr_err_t scoutfs_parallel_restore_write_buf(struct scoutfs_parallel_restore_writer *wri,
void *buf, size_t len, off_t *off_ret,
size_t *count_ret);
spr_err_t scoutfs_parallel_restore_import_super(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_super_block *super);
spr_err_t scoutfs_parallel_restore_export_super(struct scoutfs_parallel_restore_writer *wri,
struct scoutfs_super_block *super);
#endif
+31 -19
View File
@@ -609,6 +609,8 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
u64 blkno;
u64 start;
u64 len;
u64 st;
u64 nr;
int wid;
int ret;
int i;
@@ -627,27 +629,37 @@ static int print_alloc_list_block(int fd, char *str, struct scoutfs_block_ref *r
AL_REF_A(&lblk->next), le32_to_cpu(lblk->start),
le32_to_cpu(lblk->nr));
if (lblk->nr) {
wid = printf(" exts: ");
start = 0;
len = 0;
for (i = 0; i < le32_to_cpu(lblk->nr); i++) {
if (len == 0)
start = le64_to_cpu(lblk->blknos[i]);
len++;
if (i == (le32_to_cpu(lblk->nr) - 1) ||
start + len != le64_to_cpu(lblk->blknos[i + 1])) {
if (wid >= 72)
wid = printf("\n ");
wid += printf("%llu,%llu ", start, len);
len = 0;
}
}
printf("\n");
st = le32_to_cpu(lblk->start);
nr = le32_to_cpu(lblk->nr);
if (st >= SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
nr > SCOUTFS_ALLOC_LIST_MAX_BLOCKS ||
(st + nr) > SCOUTFS_ALLOC_LIST_MAX_BLOCKS) {
printf(" (invalid start and nr fields)\n");
goto out;
}
if (lblk->nr == 0)
goto out;
wid = printf(" exts: ");
start = 0;
len = 0;
for (i = 0; i < nr; i++) {
if (len == 0)
start = le64_to_cpu(lblk->blknos[st + i]);
len++;
if (i == (nr - 1) || (start + len) != le64_to_cpu(lblk->blknos[st + i + 1])) {
if (wid >= 72)
wid = printf("\n ");
wid += printf("%llu,%llu ", start, len);
len = 0;
}
}
printf("\n");
out:
next = lblk->next;
free(lblk);
return print_alloc_list_block(fd, str, &next);
-34
View File
@@ -44,37 +44,3 @@ int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,
return tot;
}
static int encode_u64(__le64 *buf, u64 val)
{
int bytes;
val = (val << 1) ^ ((s64)val >> 63); /* shift sign extend */
bytes = (fls64(val) + 7) >> 3;
put_unaligned_le64(val, buf);
return bytes;
}
int srch_encode_entry(void *buf, struct scoutfs_srch_entry *sre, struct scoutfs_srch_entry *prev)
{
u64 diffs[] = {
le64_to_cpu(sre->hash) - le64_to_cpu(prev->hash),
le64_to_cpu(sre->ino) - le64_to_cpu(prev->ino),
le64_to_cpu(sre->id) - le64_to_cpu(prev->id),
};
u16 lengths = 0;
int bytes;
int tot = 2;
int i;
for (i = 0; i < array_size(diffs); i++) {
bytes = encode_u64(buf + tot, diffs[i]);
lengths |= bytes << (i << 2);
tot += bytes;
}
put_unaligned_le16(lengths, buf);
return tot;
}
-1
View File
@@ -3,6 +3,5 @@
int srch_decode_entry(void *buf, struct scoutfs_srch_entry *sre,
struct scoutfs_srch_entry *prev);
int srch_encode_entry(void *buf, struct scoutfs_srch_entry *sre, struct scoutfs_srch_entry *prev);
#endif
-13
View File
@@ -69,8 +69,6 @@ do { \
#define container_of(ptr, type, memb) \
((type *)((void *)(ptr) - offsetof(type, memb)))
#define NSEC_PER_SEC 1000000000
#define BITS_PER_LONG (sizeof(long) * 8)
#define U8_MAX ((u8)~0ULL)
#define U16_MAX ((u16)~0ULL)
@@ -83,7 +81,6 @@ do { \
\
(_x == 0 ? 0 : 64 - __builtin_clzll(_x)); \
})
#define fls64(x) flsll(x)
#define ilog2(x) \
({ \
@@ -101,16 +98,6 @@ emit_get_unaligned_le(16)
emit_get_unaligned_le(32)
emit_get_unaligned_le(64)
#define emit_put_unaligned_le(nr) \
static inline void put_unaligned_le##nr(u##nr val, void *buf) \
{ \
__le##nr x = cpu_to_le##nr(val); \
memcpy(buf, &x, sizeof(x)); \
}
emit_put_unaligned_le(16)
emit_put_unaligned_le(32)
emit_put_unaligned_le(64)
/*
* return -1,0,+1 based on the memcmp comparison of the minimum of their
* two lengths. If their min shared bytes are equal but the lengths