mirror of
https://github.com/versity/scoutfs.git
synced 2026-05-03 11:25:43 +00:00
Compare commits
2 Commits
zab/disabl
...
ben/test
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0b521a943e | ||
|
|
36a3f04566 |
12
README.md
12
README.md
@@ -31,9 +31,15 @@ functionality hasn't been implemented. It's appropriate for early
|
||||
adopters and interested developers, not for production use.
|
||||
|
||||
In that vein, expect significant incompatible changes to both the format
|
||||
of network messages and persistent structures. Since the format hash-checking
|
||||
has now been removed in preparation for release, if there is any doubt, mkfs
|
||||
is strongly recommended.
|
||||
of network messages and persistent structures. To avoid mistakes the
|
||||
implementation currently calculates a hash of the format and ioctl
|
||||
header files in the source tree. The kernel module will refuse to mount
|
||||
a volume created by userspace utilities with a mismatched hash, and it
|
||||
will refuse to connect to a remote node with a mismatched hash. This
|
||||
means having to unmount, mkfs, and remount everything across many
|
||||
functional changes. Once the format is nailed down we'll wire up
|
||||
forward and back compat machinery and remove this temporary safety
|
||||
measure.
|
||||
|
||||
The current kernel module is developed against the RHEL/CentOS 7.x
|
||||
kernel to minimize the friction of developing and testing with partners'
|
||||
|
||||
@@ -16,7 +16,11 @@ SCOUTFS_GIT_DESCRIBE := \
|
||||
$(shell git describe --all --abbrev=6 --long 2>/dev/null || \
|
||||
echo no-git)
|
||||
|
||||
SCOUTFS_FORMAT_HASH := \
|
||||
$(shell cat src/format.h src/ioctl.h | md5sum | cut -b1-16)
|
||||
|
||||
SCOUTFS_ARGS := SCOUTFS_GIT_DESCRIBE=$(SCOUTFS_GIT_DESCRIBE) \
|
||||
SCOUTFS_FORMAT_HASH=$(SCOUTFS_FORMAT_HASH) \
|
||||
CONFIG_SCOUTFS_FS=m -C $(SK_KSRC) M=$(CURDIR)/src \
|
||||
EXTRA_CFLAGS="-Werror"
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o
|
||||
|
||||
CFLAGS_super.o = -DSCOUTFS_GIT_DESCRIBE=\"$(SCOUTFS_GIT_DESCRIBE)\"
|
||||
CFLAGS_super.o = -DSCOUTFS_GIT_DESCRIBE=\"$(SCOUTFS_GIT_DESCRIBE)\" \
|
||||
-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
|
||||
|
||||
CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include
|
||||
|
||||
|
||||
@@ -770,13 +770,8 @@ int scoutfs_alloc_data(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret < 0) {
|
||||
/*
|
||||
* Special retval meaning there wasn't space to alloc from
|
||||
* this txn. Doesn't mean filesystem is completely full.
|
||||
* Maybe upper layers want to try again.
|
||||
*/
|
||||
if (ret == -ENOENT)
|
||||
ret = -ENOBUFS;
|
||||
ret = -ENOSPC;
|
||||
*blkno_ret = 0;
|
||||
*count_ret = 0;
|
||||
} else {
|
||||
|
||||
@@ -121,14 +121,16 @@ int scoutfs_client_get_roots(struct super_block *sb,
|
||||
int scoutfs_client_advance_seq(struct super_block *sb, u64 *seq)
|
||||
{
|
||||
struct client_info *client = SCOUTFS_SB(sb)->client_info;
|
||||
__le64 leseq;
|
||||
__le64 before = cpu_to_le64p(seq);
|
||||
__le64 after;
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_net_sync_request(sb, client->conn,
|
||||
SCOUTFS_NET_CMD_ADVANCE_SEQ,
|
||||
NULL, 0, &leseq, sizeof(leseq));
|
||||
&before, sizeof(before),
|
||||
&after, sizeof(after));
|
||||
if (ret == 0)
|
||||
*seq = le64_to_cpu(leseq);
|
||||
*seq = le64_to_cpu(after);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -280,10 +282,10 @@ static int client_greeting(struct super_block *sb,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (gr->version != super->version) {
|
||||
if (gr->format_hash != super->format_hash) {
|
||||
scoutfs_warn(sb, "server sent format 0x%llx, client has 0x%llx",
|
||||
le64_to_cpu(gr->version),
|
||||
le64_to_cpu(super->version));
|
||||
le64_to_cpu(gr->format_hash),
|
||||
le64_to_cpu(super->format_hash));
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
@@ -392,7 +394,7 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
|
||||
|
||||
/* send a greeting to verify endpoints of each connection */
|
||||
greet.fsid = super->hdr.fsid;
|
||||
greet.version = super->version;
|
||||
greet.format_hash = super->format_hash;
|
||||
greet.server_term = cpu_to_le64(client->server_term);
|
||||
greet.unmount_barrier = cpu_to_le64(client->greeting_umb);
|
||||
greet.rid = cpu_to_le64(sbi->rid);
|
||||
|
||||
315
kmod/src/count.h
Normal file
315
kmod/src/count.h
Normal file
@@ -0,0 +1,315 @@
|
||||
#ifndef _SCOUTFS_COUNT_H_
|
||||
#define _SCOUTFS_COUNT_H_
|
||||
|
||||
/*
|
||||
* Our estimate of the space consumed while dirtying items is based on
|
||||
* the number of items and the size of their values.
|
||||
*
|
||||
* The estimate is still a read-only input to entering the transaction.
|
||||
* We'd like to use it as a clean rhs arg to hold_trans. We define SIC_
|
||||
* functions which return the count struct. This lets us have a single
|
||||
* arg and avoid bugs in initializing and passing in struct pointers
|
||||
 * from callers.  The internal __count functions are used to compose an
|
||||
* estimate out of the sets of items it manipulates. We program in much
|
||||
* clearer C instead of in the preprocessor.
|
||||
*
|
||||
* Compilers are able to collapse the inlines into constants for the
|
||||
* constant estimates.
|
||||
*/
|
||||
|
||||
/*
 * A transaction space estimate: how many items are dirtied and the
 * total size of their values.
 */
struct scoutfs_item_count {
	int items;
	int vals;
};
|
||||
|
||||
/* The caller knows exactly what they're doing. */
|
||||
static inline const struct scoutfs_item_count SIC_EXACT(signed items,
|
||||
signed vals)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {
|
||||
.items = items,
|
||||
.vals = vals,
|
||||
};
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocating an inode creates a new set of indexed items.
|
||||
*/
|
||||
static inline void __count_alloc_inode(struct scoutfs_item_count *cnt)
|
||||
{
|
||||
const int nr_indices = SCOUTFS_INODE_INDEX_NR;
|
||||
|
||||
cnt->items += 1 + nr_indices;
|
||||
cnt->vals += sizeof(struct scoutfs_inode);
|
||||
}
|
||||
|
||||
/*
|
||||
* Dirtying an inode dirties the inode item and can delete and create
|
||||
* the full set of indexed items.
|
||||
*/
|
||||
static inline void __count_dirty_inode(struct scoutfs_item_count *cnt)
|
||||
{
|
||||
const int nr_indices = 2 * SCOUTFS_INODE_INDEX_NR;
|
||||
|
||||
cnt->items += 1 + nr_indices;
|
||||
cnt->vals += sizeof(struct scoutfs_inode);
|
||||
}
|
||||
|
||||
static inline const struct scoutfs_item_count SIC_ALLOC_INODE(void)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
|
||||
__count_alloc_inode(&cnt);
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
static inline const struct scoutfs_item_count SIC_DIRTY_INODE(void)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
|
||||
__count_dirty_inode(&cnt);
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Directory entries are stored in three items.
|
||||
*/
|
||||
static inline void __count_dirents(struct scoutfs_item_count *cnt,
|
||||
unsigned name_len)
|
||||
{
|
||||
cnt->items += 3;
|
||||
cnt->vals += 3 * offsetof(struct scoutfs_dirent, name[name_len]);
|
||||
}
|
||||
|
||||
static inline void __count_sym_target(struct scoutfs_item_count *cnt,
|
||||
unsigned size)
|
||||
{
|
||||
unsigned nr = DIV_ROUND_UP(size, SCOUTFS_MAX_VAL_SIZE);
|
||||
|
||||
cnt->items += nr;
|
||||
cnt->vals += size;
|
||||
}
|
||||
|
||||
static inline void __count_orphan(struct scoutfs_item_count *cnt)
|
||||
{
|
||||
|
||||
cnt->items += 1;
|
||||
}
|
||||
|
||||
/*
 * Creating a named inode allocates the inode, adds its directory
 * entry items, and dirties one more inode.
 * NOTE(review): the dirtied inode is presumably the parent directory;
 * confirm against lock_hold_create().
 */
static inline void __count_mknod(struct scoutfs_item_count *cnt,
				 unsigned name_len)
{
	/* the new inode and its index items */
	__count_alloc_inode(cnt);
	/* the entry items named by the new dentry */
	__count_dirents(cnt, name_len);
	/* one existing inode updated as part of the create */
	__count_dirty_inode(cnt);
}
|
||||
|
||||
static inline const struct scoutfs_item_count SIC_MKNOD(unsigned name_len)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
|
||||
__count_mknod(&cnt, name_len);
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Dropping the inode deletes all its items. Potentially enormous numbers
|
||||
* of items (data mapping, xattrs) are deleted in their own transactions.
|
||||
*/
|
||||
static inline const struct scoutfs_item_count SIC_DROP_INODE(int mode,
|
||||
u64 size)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
|
||||
if (S_ISLNK(mode))
|
||||
__count_sym_target(&cnt, size);
|
||||
__count_dirty_inode(&cnt);
|
||||
__count_orphan(&cnt);
|
||||
|
||||
cnt.vals = 0;
|
||||
return cnt;
|
||||
}
|
||||
|
||||
static inline const struct scoutfs_item_count SIC_LINK(unsigned name_len)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
|
||||
__count_dirents(&cnt, name_len);
|
||||
__count_dirty_inode(&cnt);
|
||||
__count_dirty_inode(&cnt);
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Unlink can add orphan items.
|
||||
*/
|
||||
static inline const struct scoutfs_item_count SIC_UNLINK(unsigned name_len)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
|
||||
__count_dirents(&cnt, name_len);
|
||||
__count_dirty_inode(&cnt);
|
||||
__count_dirty_inode(&cnt);
|
||||
__count_orphan(&cnt);
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
static inline const struct scoutfs_item_count SIC_SYMLINK(unsigned name_len,
|
||||
unsigned size)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
|
||||
__count_mknod(&cnt, name_len);
|
||||
__count_sym_target(&cnt, size);
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* This assumes the worst case of a rename between directories that
|
||||
* unlinks an existing target. That'll be worse than the common case
|
||||
* by a few hundred bytes.
|
||||
*/
|
||||
static inline const struct scoutfs_item_count SIC_RENAME(unsigned old_len,
|
||||
unsigned new_len)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
|
||||
/* dirty dirs and inodes */
|
||||
__count_dirty_inode(&cnt);
|
||||
__count_dirty_inode(&cnt);
|
||||
__count_dirty_inode(&cnt);
|
||||
__count_dirty_inode(&cnt);
|
||||
|
||||
/* unlink old and new, link new */
|
||||
__count_dirents(&cnt, old_len);
|
||||
__count_dirents(&cnt, new_len);
|
||||
__count_dirents(&cnt, new_len);
|
||||
|
||||
/* orphan the existing target */
|
||||
__count_orphan(&cnt);
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Creating an xattr results in a dirty set of items with values that
|
||||
* store the xattr header, name, and value. There's always at least one
|
||||
* item with the header and name. Any previously existing items are
|
||||
* deleted which dirties their key but removes their value. The two
|
||||
* sets of items are indexed by different ids so their items don't
|
||||
* overlap.
|
||||
*/
|
||||
static inline const struct scoutfs_item_count SIC_XATTR_SET(unsigned old_parts,
|
||||
bool creating,
|
||||
unsigned name_len,
|
||||
unsigned size)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
unsigned int new_parts;
|
||||
|
||||
__count_dirty_inode(&cnt);
|
||||
|
||||
if (old_parts)
|
||||
cnt.items += old_parts;
|
||||
|
||||
if (creating) {
|
||||
new_parts = SCOUTFS_XATTR_NR_PARTS(name_len, size);
|
||||
|
||||
cnt.items += new_parts;
|
||||
cnt.vals += sizeof(struct scoutfs_xattr) + name_len + size;
|
||||
}
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* write_begin can have to allocate all the blocks in the page and can
|
||||
* have to add a big allocation from the server to do so:
|
||||
* - merge added free extents from the server
|
||||
* - remove a free extent per block
|
||||
* - remove an offline extent for every other block
|
||||
* - add a file extent per block
|
||||
*/
|
||||
static inline const struct scoutfs_item_count SIC_WRITE_BEGIN(void)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
unsigned nr_free = (1 + SCOUTFS_BLOCK_SM_PER_PAGE) * 3;
|
||||
unsigned nr_file = (DIV_ROUND_UP(SCOUTFS_BLOCK_SM_PER_PAGE, 2) +
|
||||
SCOUTFS_BLOCK_SM_PER_PAGE) * 3;
|
||||
|
||||
__count_dirty_inode(&cnt);
|
||||
|
||||
cnt.items += nr_free + nr_file;
|
||||
cnt.vals += nr_file;
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Truncating an extent can:
|
||||
* - delete existing file extent,
|
||||
* - create two surrounding file extents,
|
||||
* - add an offline file extent,
|
||||
* - delete two existing free extents
|
||||
* - create a merged free extent
|
||||
*/
|
||||
static inline const struct scoutfs_item_count
|
||||
SIC_TRUNC_EXTENT(struct inode *inode)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
unsigned int nr_file = 1 + 2 + 1;
|
||||
unsigned int nr_free = (2 + 1) * 2;
|
||||
|
||||
if (inode)
|
||||
__count_dirty_inode(&cnt);
|
||||
|
||||
cnt.items += nr_file + nr_free;
|
||||
cnt.vals += nr_file;
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fallocating an extent can, at most:
|
||||
* - allocate from the server: delete two free and insert merged
|
||||
* - free an allocated extent: delete one and create two split
|
||||
* - remove an unallocated file extent: delete one and create two split
|
||||
* - add an fallocated flie extent: delete two and inset one merged
|
||||
*/
|
||||
static inline const struct scoutfs_item_count SIC_FALLOCATE_ONE(void)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
unsigned int nr_free = ((1 + 2) * 2) * 2;
|
||||
unsigned int nr_file = (1 + 2) * 2;
|
||||
|
||||
__count_dirty_inode(&cnt);
|
||||
|
||||
cnt.items += nr_free + nr_file;
|
||||
cnt.vals += nr_file;
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
/*
|
||||
* ioc_setattr_more can dirty the inode and add a single offline extent.
|
||||
*/
|
||||
static inline const struct scoutfs_item_count SIC_SETATTR_MORE(void)
|
||||
{
|
||||
struct scoutfs_item_count cnt = {0,};
|
||||
|
||||
__count_dirty_inode(&cnt);
|
||||
|
||||
cnt.items++;
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -58,8 +58,6 @@
|
||||
EXPAND_COUNTER(corrupt_symlink_inode_size) \
|
||||
EXPAND_COUNTER(corrupt_symlink_missing_item) \
|
||||
EXPAND_COUNTER(corrupt_symlink_not_null_term) \
|
||||
EXPAND_COUNTER(data_fallocate_enobufs_retry) \
|
||||
EXPAND_COUNTER(data_write_begin_enobufs_retry) \
|
||||
EXPAND_COUNTER(dentry_revalidate_error) \
|
||||
EXPAND_COUNTER(dentry_revalidate_invalid) \
|
||||
EXPAND_COUNTER(dentry_revalidate_locked) \
|
||||
|
||||
@@ -37,6 +37,7 @@
|
||||
#include "lock.h"
|
||||
#include "file.h"
|
||||
#include "msg.h"
|
||||
#include "count.h"
|
||||
#include "ext.h"
|
||||
#include "util.h"
|
||||
|
||||
@@ -290,6 +291,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
|
||||
u64 ino, u64 iblock, u64 last, bool offline,
|
||||
struct scoutfs_lock *lock)
|
||||
{
|
||||
struct scoutfs_item_count cnt = SIC_TRUNC_EXTENT(inode);
|
||||
struct scoutfs_inode_info *si = NULL;
|
||||
LIST_HEAD(ind_locks);
|
||||
s64 ret = 0;
|
||||
@@ -313,9 +315,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
|
||||
while (iblock <= last) {
|
||||
if (inode)
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
|
||||
true);
|
||||
true, cnt);
|
||||
else
|
||||
ret = scoutfs_hold_trans(sb);
|
||||
ret = scoutfs_hold_trans(sb, cnt);
|
||||
if (ret)
|
||||
break;
|
||||
|
||||
@@ -751,13 +753,13 @@ static int scoutfs_write_begin(struct file *file,
|
||||
goto out;
|
||||
}
|
||||
|
||||
retry:
|
||||
do {
|
||||
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
|
||||
scoutfs_inode_index_prepare(sb, &wbd->ind_locks, inode,
|
||||
true) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, &wbd->ind_locks,
|
||||
ind_seq);
|
||||
ind_seq,
|
||||
SIC_WRITE_BEGIN());
|
||||
} while (ret > 0);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
@@ -766,22 +768,17 @@ retry:
|
||||
flags |= AOP_FLAG_NOFS;
|
||||
|
||||
/* generic write_end updates i_size and calls dirty_inode */
|
||||
ret = scoutfs_dirty_inode_item(inode, wbd->lock) ?:
|
||||
block_write_begin(mapping, pos, len, flags, pagep,
|
||||
scoutfs_get_block_write);
|
||||
if (ret < 0) {
|
||||
ret = scoutfs_dirty_inode_item(inode, wbd->lock);
|
||||
if (ret == 0)
|
||||
ret = block_write_begin(mapping, pos, len, flags, pagep,
|
||||
scoutfs_get_block_write);
|
||||
if (ret)
|
||||
scoutfs_release_trans(sb);
|
||||
scoutfs_inode_index_unlock(sb, &wbd->ind_locks);
|
||||
if (ret == -ENOBUFS) {
|
||||
/* Retry with a new transaction. */
|
||||
scoutfs_inc_counter(sb, data_write_begin_enobufs_retry);
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (ret < 0)
|
||||
if (ret) {
|
||||
scoutfs_inode_index_unlock(sb, &wbd->ind_locks);
|
||||
kfree(wbd);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1010,7 +1007,8 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
|
||||
while(iblock <= last) {
|
||||
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
|
||||
SIC_FALLOCATE_ONE());
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -1028,12 +1026,6 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
|
||||
scoutfs_release_trans(sb);
|
||||
scoutfs_inode_index_unlock(sb, &ind_locks);
|
||||
|
||||
/* txn couldn't meet the request. Let's try with a new txn */
|
||||
if (ret == -ENOBUFS) {
|
||||
scoutfs_inc_counter(sb, data_fallocate_enobufs_retry);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ret <= 0)
|
||||
goto out;
|
||||
|
||||
@@ -1086,7 +1078,8 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
|
||||
}
|
||||
|
||||
/* we're updating meta_seq with offline block count */
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
|
||||
SIC_SETATTR_MORE());
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
@@ -1231,7 +1224,8 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
ret = scoutfs_inode_index_start(sb, &seq) ?:
|
||||
scoutfs_inode_index_prepare(sb, &locks, from, true) ?:
|
||||
scoutfs_inode_index_prepare(sb, &locks, to, true) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, &locks, seq);
|
||||
scoutfs_inode_index_try_lock_hold(sb, &locks, seq,
|
||||
SIC_EXACT(1, 1));
|
||||
if (ret > 0)
|
||||
continue;
|
||||
if (ret < 0)
|
||||
|
||||
@@ -463,18 +463,7 @@ out:
|
||||
else
|
||||
inode = scoutfs_iget(sb, ino);
|
||||
|
||||
/*
|
||||
* We can't splice dir aliases into the dcache. dir entries
|
||||
* might have changed on other nodes so our dcache could still
|
||||
* contain them, rather than having been moved in rename. For
|
||||
* dirs, we use d_materialize_unique to remove any existing
|
||||
* aliases which must be stale. Our inode numbers aren't reused
|
||||
* so inodes pointed to by entries can't change types.
|
||||
*/
|
||||
if (!IS_ERR_OR_NULL(inode) && S_ISDIR(inode->i_mode))
|
||||
return d_materialise_unique(dentry, inode);
|
||||
else
|
||||
return d_splice_alias(inode, dentry);
|
||||
return d_splice_alias(inode, dentry);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -666,6 +655,7 @@ static int del_entry_items(struct super_block *sb, u64 dir_ino, u64 hash,
|
||||
*/
|
||||
static struct inode *lock_hold_create(struct inode *dir, struct dentry *dentry,
|
||||
umode_t mode, dev_t rdev,
|
||||
const struct scoutfs_item_count cnt,
|
||||
struct scoutfs_lock **dir_lock,
|
||||
struct scoutfs_lock **inode_lock,
|
||||
struct list_head *ind_locks)
|
||||
@@ -704,7 +694,7 @@ retry:
|
||||
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
|
||||
scoutfs_inode_index_prepare(sb, ind_locks, dir, true) ?:
|
||||
scoutfs_inode_index_prepare_ino(sb, ind_locks, ino, mode) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq);
|
||||
scoutfs_inode_index_try_lock_hold(sb, ind_locks, ind_seq, cnt);
|
||||
if (ret > 0)
|
||||
goto retry;
|
||||
if (ret)
|
||||
@@ -751,6 +741,7 @@ static int scoutfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
|
||||
|
||||
hash = dirent_name_hash(dentry->d_name.name, dentry->d_name.len);
|
||||
inode = lock_hold_create(dir, dentry, mode, rdev,
|
||||
SIC_MKNOD(dentry->d_name.len),
|
||||
&dir_lock, &inode_lock, &ind_locks);
|
||||
if (IS_ERR(inode))
|
||||
return PTR_ERR(inode);
|
||||
@@ -845,7 +836,8 @@ retry:
|
||||
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
|
||||
scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
|
||||
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
|
||||
SIC_LINK(dentry->d_name.len));
|
||||
if (ret > 0)
|
||||
goto retry;
|
||||
if (ret)
|
||||
@@ -926,7 +918,8 @@ retry:
|
||||
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
|
||||
scoutfs_inode_index_prepare(sb, &ind_locks, dir, false) ?:
|
||||
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
|
||||
SIC_UNLINK(dentry->d_name.len));
|
||||
if (ret > 0)
|
||||
goto retry;
|
||||
if (ret)
|
||||
@@ -1161,6 +1154,7 @@ static int scoutfs_symlink(struct inode *dir, struct dentry *dentry,
|
||||
return ret;
|
||||
|
||||
inode = lock_hold_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0,
|
||||
SIC_SYMLINK(dentry->d_name.len, name_len),
|
||||
&dir_lock, &inode_lock, &ind_locks);
|
||||
if (IS_ERR(inode))
|
||||
return PTR_ERR(inode);
|
||||
@@ -1592,7 +1586,9 @@ retry:
|
||||
scoutfs_inode_index_prepare(sb, &ind_locks, new_dir, false)) ?:
|
||||
(new_inode == NULL ? 0 :
|
||||
scoutfs_inode_index_prepare(sb, &ind_locks, new_inode, false)) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
|
||||
SIC_RENAME(old_dentry->d_name.len,
|
||||
new_dentry->d_name.len));
|
||||
if (ret > 0)
|
||||
goto retry;
|
||||
if (ret)
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
#ifndef _SCOUTFS_FORMAT_H_
|
||||
#define _SCOUTFS_FORMAT_H_
|
||||
|
||||
#define SCOUTFS_INTEROP_VERSION 0ULL
|
||||
#define SCOUTFS_INTEROP_VERSION_STR __stringify(0)
|
||||
|
||||
/* statfs(2) f_type */
|
||||
#define SCOUTFS_SUPER_MAGIC 0x554f4353 /* "SCOU" */
|
||||
|
||||
@@ -176,6 +173,19 @@ struct scoutfs_key {
|
||||
#define skfl_neglen _sk_second
|
||||
#define skfl_blkno _sk_third
|
||||
|
||||
struct scoutfs_radix_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
union {
|
||||
struct scoutfs_radix_ref {
|
||||
__le64 blkno;
|
||||
__le64 seq;
|
||||
__le64 sm_total;
|
||||
__le64 lg_total;
|
||||
} refs[0];
|
||||
__le64 bits[0];
|
||||
};
|
||||
};
|
||||
|
||||
struct scoutfs_avl_root {
|
||||
__le16 node;
|
||||
};
|
||||
@@ -586,7 +596,7 @@ struct scoutfs_quorum_block {
|
||||
struct scoutfs_super_block {
|
||||
struct scoutfs_block_header hdr;
|
||||
__le64 id;
|
||||
__le64 version;
|
||||
__le64 format_hash;
|
||||
__le64 flags;
|
||||
__u8 uuid[SCOUTFS_UUID_BYTES];
|
||||
__le64 next_ino;
|
||||
@@ -749,7 +759,7 @@ enum scoutfs_dentry_type {
|
||||
*/
|
||||
struct scoutfs_net_greeting {
|
||||
__le64 fsid;
|
||||
__le64 version;
|
||||
__le64 format_hash;
|
||||
__le64 server_term;
|
||||
__le64 unmount_barrier;
|
||||
__le64 rid;
|
||||
|
||||
@@ -343,7 +343,8 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
|
||||
if (!S_ISREG(inode->i_mode))
|
||||
return 0;
|
||||
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true);
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true,
|
||||
SIC_DIRTY_INODE());
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -370,7 +371,8 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
|
||||
LIST_HEAD(ind_locks);
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
|
||||
SIC_DIRTY_INODE());
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -485,7 +487,8 @@ retry:
|
||||
}
|
||||
}
|
||||
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false);
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, false,
|
||||
SIC_DIRTY_INODE());
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -1186,7 +1189,8 @@ int scoutfs_inode_index_start(struct super_block *sb, u64 *seq)
|
||||
* Returns > 0 if the seq changed and the locks should be retried.
|
||||
*/
|
||||
int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
|
||||
struct list_head *list, u64 seq)
|
||||
struct list_head *list, u64 seq,
|
||||
const struct scoutfs_item_count cnt)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct index_lock *ind_lock;
|
||||
@@ -1202,7 +1206,7 @@ int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_hold_trans(sb);
|
||||
ret = scoutfs_hold_trans(sb, cnt);
|
||||
if (ret == 0 && seq != sbi->trans_seq) {
|
||||
scoutfs_release_trans(sb);
|
||||
ret = 1;
|
||||
@@ -1216,7 +1220,8 @@ out:
|
||||
}
|
||||
|
||||
int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
|
||||
bool set_data_seq)
|
||||
bool set_data_seq,
|
||||
const struct scoutfs_item_count cnt)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
int ret;
|
||||
@@ -1226,7 +1231,7 @@ int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
|
||||
ret = scoutfs_inode_index_start(sb, &seq) ?:
|
||||
scoutfs_inode_index_prepare(sb, list, inode,
|
||||
set_data_seq) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, list, seq);
|
||||
scoutfs_inode_index_try_lock_hold(sb, list, seq, cnt);
|
||||
} while (ret > 0);
|
||||
|
||||
return ret;
|
||||
@@ -1494,7 +1499,8 @@ static int delete_inode_items(struct super_block *sb, u64 ino)
|
||||
retry:
|
||||
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
|
||||
prepare_index_deletion(sb, &ind_locks, ino, mode, &sinode) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
|
||||
SIC_DROP_INODE(mode, size));
|
||||
if (ret > 0)
|
||||
goto retry;
|
||||
if (ret)
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "key.h"
|
||||
#include "lock.h"
|
||||
#include "per_task.h"
|
||||
#include "count.h"
|
||||
#include "format.h"
|
||||
#include "data.h"
|
||||
|
||||
@@ -82,9 +83,11 @@ int scoutfs_inode_index_prepare_ino(struct super_block *sb,
|
||||
struct list_head *list, u64 ino,
|
||||
umode_t mode);
|
||||
int scoutfs_inode_index_try_lock_hold(struct super_block *sb,
|
||||
struct list_head *list, u64 seq);
|
||||
struct list_head *list, u64 seq,
|
||||
const struct scoutfs_item_count cnt);
|
||||
int scoutfs_inode_index_lock_hold(struct inode *inode, struct list_head *list,
|
||||
bool set_data_seq);
|
||||
bool set_data_seq,
|
||||
const struct scoutfs_item_count cnt);
|
||||
void scoutfs_inode_index_unlock(struct super_block *sb, struct list_head *list);
|
||||
|
||||
int scoutfs_dirty_inode_item(struct inode *inode, struct scoutfs_lock *lock);
|
||||
|
||||
@@ -674,7 +674,8 @@ static long scoutfs_ioc_setattr_more(struct file *file, unsigned long arg)
|
||||
|
||||
/* setting only so we don't see 0 data seq with nonzero data_version */
|
||||
set_data_seq = sm.data_version != 0 ? true : false;
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq);
|
||||
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, set_data_seq,
|
||||
SIC_SETATTR_MORE());
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
|
||||
@@ -1339,10 +1339,7 @@ static int read_page_item(struct super_block *sb, struct scoutfs_key *key,
|
||||
/* split needs multiple items, sparse may not have enough */
|
||||
if (!left)
|
||||
return -ENOMEM;
|
||||
|
||||
compact_page_items(sb, pg, left);
|
||||
found = item_rbtree_walk(&pg->item_root, key, NULL, &par,
|
||||
&pnode);
|
||||
}
|
||||
|
||||
item = alloc_item(pg, key, liv, val, val_len);
|
||||
@@ -1494,8 +1491,6 @@ retry:
|
||||
rbtree_erase(&rd->node, &root);
|
||||
rbtree_insert(&rd->node, par, pnode, &cinf->pg_root);
|
||||
lru_accessed(sb, cinf, rd);
|
||||
trace_scoutfs_item_read_page(sb, key, &rd->start,
|
||||
&rd->end);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -2347,8 +2342,6 @@ retry:
|
||||
write_lock(&pg->rwlock);
|
||||
|
||||
pgi = trim_page_intersection(sb, cinf, pg, right, start, end);
|
||||
trace_scoutfs_item_invalidate_page(sb, start, end,
|
||||
&pg->start, &pg->end, pgi);
|
||||
BUG_ON(pgi == PGI_DISJOINT); /* walk wouldn't ret disjoint */
|
||||
|
||||
if (pgi == PGI_INSIDE) {
|
||||
@@ -2371,9 +2364,9 @@ retry:
|
||||
/* inv was entirely inside page, done after bisect */
|
||||
write_trylock_will_succeed(&right->rwlock);
|
||||
rbtree_insert(&right->node, par, pnode, &cinf->pg_root);
|
||||
lru_accessed(sb, cinf, right);
|
||||
write_unlock(&right->rwlock);
|
||||
write_unlock(&pg->rwlock);
|
||||
lru_accessed(sb, cinf, right);
|
||||
right = NULL;
|
||||
break;
|
||||
}
|
||||
@@ -2403,6 +2396,7 @@ static int item_lru_shrink(struct shrinker *shrink,
|
||||
struct active_reader *active;
|
||||
struct cached_page *tmp;
|
||||
struct cached_page *pg;
|
||||
LIST_HEAD(list);
|
||||
int nr;
|
||||
|
||||
if (sc->nr_to_scan == 0)
|
||||
@@ -2439,17 +2433,21 @@ static int item_lru_shrink(struct shrinker *shrink,
|
||||
|
||||
__lru_remove(sb, cinf, pg);
|
||||
rbtree_erase(&pg->node, &cinf->pg_root);
|
||||
list_move_tail(&pg->lru_head, &list);
|
||||
invalidate_pcpu_page(pg);
|
||||
write_unlock(&pg->rwlock);
|
||||
|
||||
put_pg(sb, pg);
|
||||
|
||||
if (--nr == 0)
|
||||
break;
|
||||
}
|
||||
|
||||
write_unlock(&cinf->rwlock);
|
||||
spin_unlock(&cinf->lru_lock);
|
||||
|
||||
list_for_each_entry_safe(pg, tmp, &list, lru_head) {
|
||||
list_del_init(&pg->lru_head);
|
||||
put_pg(sb, pg);
|
||||
}
|
||||
out:
|
||||
return min_t(unsigned long, cinf->lru_pages, INT_MAX);
|
||||
}
|
||||
|
||||
@@ -65,7 +65,7 @@
|
||||
* relative to that lock state we resend.
|
||||
*/
|
||||
|
||||
#define GRACE_PERIOD_KT ms_to_ktime(10)
|
||||
#define GRACE_PERIOD_KT ms_to_ktime(2)
|
||||
|
||||
/*
|
||||
* allocated per-super, freed on unmount.
|
||||
@@ -770,6 +770,16 @@ static void lock_invalidate_worker(struct work_struct *work)
|
||||
list_for_each_entry_safe(lock, tmp, &linfo->inv_list, inv_head) {
|
||||
nl = &lock->inv_nl;
|
||||
|
||||
/* skip if grace hasn't elapsed, record earliest */
|
||||
deadline = lock->grace_deadline;
|
||||
if (ktime_before(now, deadline)) {
|
||||
delay = min(delay,
|
||||
nsecs_to_jiffies(ktime_to_ns(
|
||||
ktime_sub(deadline, now))));
|
||||
scoutfs_inc_counter(linfo->sb, lock_grace_wait);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* wait for reordered grant to finish */
|
||||
if (lock->mode != nl->old_mode)
|
||||
continue;
|
||||
@@ -778,15 +788,6 @@ static void lock_invalidate_worker(struct work_struct *work)
|
||||
if (!lock_counts_match(nl->new_mode, lock->users))
|
||||
continue;
|
||||
|
||||
/* skip if grace hasn't elapsed, record earliest */
|
||||
deadline = lock->grace_deadline;
|
||||
if (!linfo->shutdown && ktime_before(now, deadline)) {
|
||||
delay = min(delay,
|
||||
nsecs_to_jiffies(ktime_to_ns(
|
||||
ktime_sub(deadline, now))));
|
||||
scoutfs_inc_counter(linfo->sb, lock_grace_wait);
|
||||
continue;
|
||||
}
|
||||
/* set the new mode, no incompatible users during inval */
|
||||
lock->mode = nl->new_mode;
|
||||
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
#include "lock.h"
|
||||
#include "super.h"
|
||||
#include "ioctl.h"
|
||||
#include "count.h"
|
||||
#include "export.h"
|
||||
#include "dir.h"
|
||||
#include "server.h"
|
||||
@@ -425,59 +426,133 @@ TRACE_EVENT(scoutfs_trans_write_func,
|
||||
|
||||
TRACE_EVENT(scoutfs_release_trans,
|
||||
TP_PROTO(struct super_block *sb, void *rsv, unsigned int rsv_holders,
|
||||
unsigned int tri_holders,
|
||||
unsigned int tri_writing),
|
||||
struct scoutfs_item_count *res,
|
||||
struct scoutfs_item_count *act, unsigned int tri_holders,
|
||||
unsigned int tri_writing, unsigned int tri_items,
|
||||
unsigned int tri_vals),
|
||||
|
||||
TP_ARGS(sb, rsv, rsv_holders, tri_holders, tri_writing),
|
||||
TP_ARGS(sb, rsv, rsv_holders, res, act, tri_holders, tri_writing,
|
||||
tri_items, tri_vals),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(void *, rsv)
|
||||
__field(unsigned int, rsv_holders)
|
||||
__field(int, res_items)
|
||||
__field(int, res_vals)
|
||||
__field(int, act_items)
|
||||
__field(int, act_vals)
|
||||
__field(unsigned int, tri_holders)
|
||||
__field(unsigned int, tri_writing)
|
||||
__field(unsigned int, tri_items)
|
||||
__field(unsigned int, tri_vals)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->rsv = rsv;
|
||||
__entry->rsv_holders = rsv_holders;
|
||||
__entry->res_items = res->items;
|
||||
__entry->res_vals = res->vals;
|
||||
__entry->act_items = act->items;
|
||||
__entry->act_vals = act->vals;
|
||||
__entry->tri_holders = tri_holders;
|
||||
__entry->tri_writing = tri_writing;
|
||||
__entry->tri_items = tri_items;
|
||||
__entry->tri_vals = tri_vals;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" rsv %p holders %u trans holders %u writing %u",
|
||||
SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
|
||||
__entry->tri_holders, __entry->tri_writing)
|
||||
TP_printk(SCSBF" rsv %p holders %u reserved %u.%u actual "
|
||||
"%d.%d, trans holders %u writing %u reserved "
|
||||
"%u.%u", SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
|
||||
__entry->res_items, __entry->res_vals, __entry->act_items,
|
||||
__entry->act_vals, __entry->tri_holders, __entry->tri_writing,
|
||||
__entry->tri_items, __entry->tri_vals)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_trans_acquired_hold,
|
||||
TP_PROTO(struct super_block *sb,
|
||||
TP_PROTO(struct super_block *sb, const struct scoutfs_item_count *cnt,
|
||||
void *rsv, unsigned int rsv_holders,
|
||||
unsigned int tri_holders,
|
||||
unsigned int tri_writing),
|
||||
struct scoutfs_item_count *res,
|
||||
struct scoutfs_item_count *act, unsigned int tri_holders,
|
||||
unsigned int tri_writing, unsigned int tri_items,
|
||||
unsigned int tri_vals),
|
||||
|
||||
TP_ARGS(sb, rsv, rsv_holders, tri_holders, tri_writing),
|
||||
TP_ARGS(sb, cnt, rsv, rsv_holders, res, act, tri_holders, tri_writing,
|
||||
tri_items, tri_vals),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(int, cnt_items)
|
||||
__field(int, cnt_vals)
|
||||
__field(void *, rsv)
|
||||
__field(unsigned int, rsv_holders)
|
||||
__field(int, res_items)
|
||||
__field(int, res_vals)
|
||||
__field(int, act_items)
|
||||
__field(int, act_vals)
|
||||
__field(unsigned int, tri_holders)
|
||||
__field(unsigned int, tri_writing)
|
||||
__field(unsigned int, tri_items)
|
||||
__field(unsigned int, tri_vals)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->cnt_items = cnt->items;
|
||||
__entry->cnt_vals = cnt->vals;
|
||||
__entry->rsv = rsv;
|
||||
__entry->rsv_holders = rsv_holders;
|
||||
__entry->res_items = res->items;
|
||||
__entry->res_vals = res->vals;
|
||||
__entry->act_items = act->items;
|
||||
__entry->act_vals = act->vals;
|
||||
__entry->tri_holders = tri_holders;
|
||||
__entry->tri_writing = tri_writing;
|
||||
__entry->tri_items = tri_items;
|
||||
__entry->tri_vals = tri_vals;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" rsv %p holders %u trans holders %u writing %u",
|
||||
SCSB_TRACE_ARGS, __entry->rsv, __entry->rsv_holders,
|
||||
__entry->tri_holders, __entry->tri_writing)
|
||||
TP_printk(SCSBF" cnt %u.%u, rsv %p holders %u reserved %u.%u "
|
||||
"actual %d.%d, trans holders %u writing %u reserved "
|
||||
"%u.%u", SCSB_TRACE_ARGS, __entry->cnt_items,
|
||||
__entry->cnt_vals, __entry->rsv, __entry->rsv_holders,
|
||||
__entry->res_items, __entry->res_vals, __entry->act_items,
|
||||
__entry->act_vals, __entry->tri_holders, __entry->tri_writing,
|
||||
__entry->tri_items, __entry->tri_vals)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_trans_track_item,
|
||||
TP_PROTO(struct super_block *sb, int delta_items, int delta_vals,
|
||||
int act_items, int act_vals, int res_items, int res_vals),
|
||||
|
||||
TP_ARGS(sb, delta_items, delta_vals, act_items, act_vals, res_items,
|
||||
res_vals),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(int, delta_items)
|
||||
__field(int, delta_vals)
|
||||
__field(int, act_items)
|
||||
__field(int, act_vals)
|
||||
__field(int, res_items)
|
||||
__field(int, res_vals)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->delta_items = delta_items;
|
||||
__entry->delta_vals = delta_vals;
|
||||
__entry->act_items = act_items;
|
||||
__entry->act_vals = act_vals;
|
||||
__entry->res_items = res_items;
|
||||
__entry->res_vals = res_vals;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" delta_items %d delta_vals %d act_items %d act_vals %d res_items %d res_vals %d",
|
||||
SCSB_TRACE_ARGS, __entry->delta_items, __entry->delta_vals,
|
||||
__entry->act_items, __entry->act_vals, __entry->res_items,
|
||||
__entry->res_vals)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_ioc_release,
|
||||
@@ -1938,27 +2013,31 @@ DEFINE_EVENT(scoutfs_clock_sync_class, scoutfs_recv_clock_sync,
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_trans_seq_advance,
|
||||
TP_PROTO(struct super_block *sb, u64 rid, u64 trans_seq),
|
||||
TP_PROTO(struct super_block *sb, u64 rid, u64 prev_seq,
|
||||
u64 next_seq),
|
||||
|
||||
TP_ARGS(sb, rid, trans_seq),
|
||||
TP_ARGS(sb, rid, prev_seq, next_seq),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, s_rid)
|
||||
__field(__u64, trans_seq)
|
||||
__field(__u64, prev_seq)
|
||||
__field(__u64, next_seq)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->s_rid = rid;
|
||||
__entry->trans_seq = trans_seq;
|
||||
__entry->prev_seq = prev_seq;
|
||||
__entry->next_seq = next_seq;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" rid %016llx trans_seq %llu\n",
|
||||
SCSB_TRACE_ARGS, __entry->s_rid, __entry->trans_seq)
|
||||
TP_printk(SCSBF" rid %016llx prev_seq %llu next_seq %llu",
|
||||
SCSB_TRACE_ARGS, __entry->s_rid, __entry->prev_seq,
|
||||
__entry->next_seq)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_trans_seq_remove,
|
||||
TRACE_EVENT(scoutfs_trans_seq_farewell,
|
||||
TP_PROTO(struct super_block *sb, u64 rid, u64 trans_seq),
|
||||
|
||||
TP_ARGS(sb, rid, trans_seq),
|
||||
@@ -2417,53 +2496,6 @@ TRACE_EVENT(scoutfs_alloc_move,
|
||||
__entry->ret)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_item_read_page,
|
||||
TP_PROTO(struct super_block *sb, struct scoutfs_key *key,
|
||||
struct scoutfs_key *pg_start, struct scoutfs_key *pg_end),
|
||||
TP_ARGS(sb, key, pg_start, pg_end),
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
sk_trace_define(key)
|
||||
sk_trace_define(pg_start)
|
||||
sk_trace_define(pg_end)
|
||||
),
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
sk_trace_assign(key, key);
|
||||
sk_trace_assign(pg_start, pg_start);
|
||||
sk_trace_assign(pg_end, pg_end);
|
||||
),
|
||||
TP_printk(SCSBF" key "SK_FMT" pg_start "SK_FMT" pg_end "SK_FMT,
|
||||
SCSB_TRACE_ARGS, sk_trace_args(key), sk_trace_args(pg_start),
|
||||
sk_trace_args(pg_end))
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_item_invalidate_page,
|
||||
TP_PROTO(struct super_block *sb, struct scoutfs_key *start,
|
||||
struct scoutfs_key *end, struct scoutfs_key *pg_start,
|
||||
struct scoutfs_key *pg_end, int pgi),
|
||||
TP_ARGS(sb, start, end, pg_start, pg_end, pgi),
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
sk_trace_define(start)
|
||||
sk_trace_define(end)
|
||||
sk_trace_define(pg_start)
|
||||
sk_trace_define(pg_end)
|
||||
__field(int, pgi)
|
||||
),
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
sk_trace_assign(start, start);
|
||||
sk_trace_assign(end, end);
|
||||
sk_trace_assign(pg_start, pg_start);
|
||||
sk_trace_assign(pg_end, pg_end);
|
||||
__entry->pgi = pgi;
|
||||
),
|
||||
TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" pg_start "SK_FMT" pg_end "SK_FMT" pgi %d",
|
||||
SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end),
|
||||
sk_trace_args(pg_start), sk_trace_args(pg_end), __entry->pgi)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_SCOUTFS_H */
|
||||
|
||||
/* This part must be outside protection */
|
||||
|
||||
@@ -649,10 +649,79 @@ static void init_trans_seq_key(struct scoutfs_key *key, u64 seq, u64 rid)
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove all trans_seq items owned by the client rid, the caller holds
|
||||
* the seq_rwsem.
|
||||
* Give the client the next sequence number for their transaction. They
|
||||
* provide their previous transaction sequence number that they've
|
||||
* committed.
|
||||
*
|
||||
* We track the sequence numbers of transactions that clients have open.
|
||||
* This limits the transaction sequence numbers that can be returned in
|
||||
* the index of inodes by meta and data transaction numbers. We
|
||||
* communicate the largest possible sequence number to clients via an
|
||||
* rpc.
|
||||
*
|
||||
* The transaction sequence tracking is stored in a btree so it is
|
||||
* shared across servers. Final entries are removed when processing a
|
||||
* client's farewell or when it's removed.
|
||||
*/
|
||||
static int remove_trans_seq_locked(struct super_block *sb, u64 rid)
|
||||
static int server_advance_seq(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
__le64 their_seq;
|
||||
__le64 next_seq;
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
struct scoutfs_key key;
|
||||
int ret;
|
||||
|
||||
if (arg_len != sizeof(__le64)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
memcpy(&their_seq, arg, sizeof(their_seq));
|
||||
|
||||
ret = scoutfs_server_hold_commit(sb);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
down_write(&server->seq_rwsem);
|
||||
|
||||
if (their_seq != 0) {
|
||||
init_trans_seq_key(&key, le64_to_cpu(their_seq), rid);
|
||||
ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
|
||||
&super->trans_seqs, &key);
|
||||
if (ret < 0 && ret != -ENOENT)
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
next_seq = super->next_trans_seq;
|
||||
le64_add_cpu(&super->next_trans_seq, 1);
|
||||
|
||||
trace_scoutfs_trans_seq_advance(sb, rid, le64_to_cpu(their_seq),
|
||||
le64_to_cpu(next_seq));
|
||||
|
||||
init_trans_seq_key(&key, le64_to_cpu(next_seq), rid);
|
||||
ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
|
||||
&super->trans_seqs, &key, NULL, 0);
|
||||
unlock:
|
||||
up_write(&server->seq_rwsem);
|
||||
ret = scoutfs_server_apply_commit(sb, ret);
|
||||
|
||||
out:
|
||||
return scoutfs_net_response(sb, conn, cmd, id, ret,
|
||||
&next_seq, sizeof(next_seq));
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove any transaction sequences owned by the client. They must have
|
||||
* committed any final transaction by the time they get here via sending
|
||||
* their farewell message. This can be called multiple times as the
|
||||
* client's farewell is retransmitted so it's OK to not find any
|
||||
* entries. This is called with the server commit rwsem held.
|
||||
*/
|
||||
static int remove_trans_seq(struct super_block *sb, u64 rid)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
@@ -661,6 +730,8 @@ static int remove_trans_seq_locked(struct super_block *sb, u64 rid)
|
||||
struct scoutfs_key key;
|
||||
int ret = 0;
|
||||
|
||||
down_write(&server->seq_rwsem);
|
||||
|
||||
init_trans_seq_key(&key, 0, 0);
|
||||
|
||||
for (;;) {
|
||||
@@ -675,102 +746,17 @@ static int remove_trans_seq_locked(struct super_block *sb, u64 rid)
|
||||
scoutfs_btree_put_iref(&iref);
|
||||
|
||||
if (le64_to_cpu(key.skts_rid) == rid) {
|
||||
trace_scoutfs_trans_seq_remove(sb, rid,
|
||||
trace_scoutfs_trans_seq_farewell(sb, rid,
|
||||
le64_to_cpu(key.skts_trans_seq));
|
||||
ret = scoutfs_btree_delete(sb, &server->alloc,
|
||||
&server->wri,
|
||||
&super->trans_seqs, &key);
|
||||
if (ret < 0)
|
||||
break;
|
||||
break;
|
||||
}
|
||||
|
||||
scoutfs_key_inc(&key);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Give the client the next sequence number for the transaction that
|
||||
* they're opening.
|
||||
*
|
||||
* We track the sequence numbers of transactions that clients have open.
|
||||
* This limits the transaction sequence numbers that can be returned in
|
||||
* the index of inodes by meta and data transaction numbers. We
|
||||
* communicate the largest possible sequence number to clients via an
|
||||
* rpc.
|
||||
*
|
||||
* The transaction sequence tracking is stored in a btree so it is
|
||||
* shared across servers. Final entries are removed when processing a
|
||||
* client's farewell or when it's removed. We can be processing a
|
||||
* resent request that was committed by a previous server before the
|
||||
* reply was lost. At this point the client has no transactions open
|
||||
* and may or may not have just finished one. To keep it simple we
|
||||
* always remove any previous seq items, if there are any, and then
|
||||
* insert a new item for the client at the next greatest seq.
|
||||
*/
|
||||
static int server_advance_seq(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
struct scoutfs_key key;
|
||||
__le64 leseq = 0;
|
||||
u64 seq;
|
||||
int ret;
|
||||
|
||||
if (arg_len != 0) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_server_hold_commit(sb);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
down_write(&server->seq_rwsem);
|
||||
|
||||
ret = remove_trans_seq_locked(sb, rid);
|
||||
if (ret < 0)
|
||||
goto unlock;
|
||||
|
||||
seq = le64_to_cpu(super->next_trans_seq);
|
||||
le64_add_cpu(&super->next_trans_seq, 1);
|
||||
|
||||
trace_scoutfs_trans_seq_advance(sb, rid, seq);
|
||||
|
||||
init_trans_seq_key(&key, seq, rid);
|
||||
ret = scoutfs_btree_insert(sb, &server->alloc, &server->wri,
|
||||
&super->trans_seqs, &key, NULL, 0);
|
||||
if (ret == 0)
|
||||
leseq = cpu_to_le64(seq);
|
||||
unlock:
|
||||
up_write(&server->seq_rwsem);
|
||||
ret = scoutfs_server_apply_commit(sb, ret);
|
||||
|
||||
out:
|
||||
return scoutfs_net_response(sb, conn, cmd, id, ret,
|
||||
&leseq, sizeof(leseq));
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove any transaction sequences owned by the client who's sent a
|
||||
* farewell. They must have committed any final transaction by the time
|
||||
* they get here via sending their farewell message. This can be called
|
||||
* multiple times as the client's farewell is retransmitted so it's OK
|
||||
* to not find any entries. This is called with the server commit rwsem
|
||||
* held.
|
||||
*/
|
||||
static int remove_trans_seq(struct super_block *sb, u64 rid)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
int ret = 0;
|
||||
|
||||
down_write(&server->seq_rwsem);
|
||||
ret = remove_trans_seq_locked(sb, rid);
|
||||
up_write(&server->seq_rwsem);
|
||||
|
||||
return ret;
|
||||
@@ -1110,20 +1096,6 @@ static int cancel_srch_compact(struct super_block *sb, u64 rid)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Farewell processing is async to the request processing work. Shutdown
|
||||
* waits for request processing to finish and then tears down the connection.
|
||||
* We don't want to queue farewell processing once we start shutting down
|
||||
* so that we don't have farewell processing racing with the connection
|
||||
* being shutdown. If a mount's farewell message is dropped by a server
|
||||
* it will be processed by the next server.
|
||||
*/
|
||||
static void queue_farewell_work(struct server_info *server)
|
||||
{
|
||||
if (!server->shutting_down)
|
||||
queue_work(server->wq, &server->farewell_work);
|
||||
}
|
||||
|
||||
/*
|
||||
* Process an incoming greeting request in the server from the client.
|
||||
* We try to send responses to failed greetings so that the sender can
|
||||
@@ -1169,10 +1141,10 @@ static int server_greeting(struct super_block *sb,
|
||||
goto send_err;
|
||||
}
|
||||
|
||||
if (gr->version != super->version) {
|
||||
if (gr->format_hash != super->format_hash) {
|
||||
scoutfs_warn(sb, "client sent format 0x%llx, server has 0x%llx",
|
||||
le64_to_cpu(gr->version),
|
||||
le64_to_cpu(super->version));
|
||||
le64_to_cpu(gr->format_hash),
|
||||
le64_to_cpu(super->format_hash));
|
||||
ret = -EINVAL;
|
||||
goto send_err;
|
||||
}
|
||||
@@ -1201,7 +1173,7 @@ send_err:
|
||||
err = ret;
|
||||
|
||||
greet.fsid = super->hdr.fsid;
|
||||
greet.version = super->version;
|
||||
greet.format_hash = super->format_hash;
|
||||
greet.server_term = cpu_to_le64(server->term);
|
||||
greet.unmount_barrier = umb;
|
||||
greet.rid = gr->rid;
|
||||
@@ -1428,8 +1400,8 @@ out:
|
||||
|
||||
if (ret < 0)
|
||||
stop_server(server);
|
||||
else if (more_reqs)
|
||||
queue_farewell_work(server);
|
||||
else if (more_reqs && !server->shutting_down)
|
||||
queue_work(server->wq, &server->farewell_work);
|
||||
}
|
||||
|
||||
static void free_farewell_requests(struct super_block *sb, u64 rid)
|
||||
@@ -1483,7 +1455,7 @@ static int server_farewell(struct super_block *sb,
|
||||
list_add_tail(&fw->entry, &server->farewell_requests);
|
||||
mutex_unlock(&server->farewell_mutex);
|
||||
|
||||
queue_farewell_work(server);
|
||||
queue_work(server->wq, &server->farewell_work);
|
||||
|
||||
/* response will be sent later */
|
||||
return 0;
|
||||
@@ -1646,16 +1618,11 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
|
||||
shutdown:
|
||||
scoutfs_info(sb, "server shutting down at "SIN_FMT, SIN_ARG(&sin));
|
||||
|
||||
/* wait for farewell to finish sending messages */
|
||||
flush_work(&server->farewell_work);
|
||||
|
||||
/* wait for requests to finish, no more requests */
|
||||
/* wait for request processing */
|
||||
scoutfs_net_shutdown(sb, conn);
|
||||
server->conn = NULL;
|
||||
|
||||
/* wait for extra queues by requests, won't find waiters */
|
||||
/* wait for commit queued by request processing */
|
||||
flush_work(&server->commit_work);
|
||||
server->conn = NULL;
|
||||
|
||||
scoutfs_lock_server_destroy(sb);
|
||||
|
||||
@@ -1729,9 +1696,8 @@ void scoutfs_server_stop(struct super_block *sb)
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
|
||||
stop_server(server);
|
||||
|
||||
/* XXX not sure both are needed */
|
||||
cancel_work_sync(&server->work);
|
||||
cancel_work_sync(&server->farewell_work);
|
||||
cancel_work_sync(&server->commit_work);
|
||||
}
|
||||
|
||||
@@ -1786,12 +1752,11 @@ void scoutfs_server_destroy(struct super_block *sb)
|
||||
|
||||
/* wait for server work to wait for everything to shut down */
|
||||
cancel_work_sync(&server->work);
|
||||
/* farewell work triggers commits */
|
||||
cancel_work_sync(&server->farewell_work);
|
||||
/* recv work/compaction could have left commit_work queued */
|
||||
cancel_work_sync(&server->commit_work);
|
||||
|
||||
/* pending farewell requests are another server's problem */
|
||||
cancel_work_sync(&server->farewell_work);
|
||||
free_farewell_requests(sb, 0);
|
||||
|
||||
trace_scoutfs_server_workqueue_destroy(sb, 0, 0);
|
||||
|
||||
@@ -1198,10 +1198,14 @@ int scoutfs_srch_get_compact(struct super_block *sb,
|
||||
|
||||
for (;;scoutfs_key_inc(&key)) {
|
||||
ret = scoutfs_btree_next(sb, root, &key, &iref);
|
||||
if (ret == -ENOENT) {
|
||||
ret = 0;
|
||||
sc->nr = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (ret == 0) {
|
||||
if (iref.key->sk_type != type) {
|
||||
ret = -ENOENT;
|
||||
} else if (iref.val_len == sizeof(sfl)) {
|
||||
if (iref.val_len == sizeof(struct scoutfs_srch_file)) {
|
||||
key = *iref.key;
|
||||
memcpy(&sfl, iref.val, iref.val_len);
|
||||
} else {
|
||||
@@ -1209,25 +1213,24 @@ int scoutfs_srch_get_compact(struct super_block *sb,
|
||||
}
|
||||
scoutfs_btree_put_iref(&iref);
|
||||
}
|
||||
if (ret < 0) {
|
||||
/* see if we ran out of log files or files entirely */
|
||||
if (ret == -ENOENT) {
|
||||
sc->nr = 0;
|
||||
if (type == SCOUTFS_SRCH_LOG_TYPE) {
|
||||
type = SCOUTFS_SRCH_BLOCKS_TYPE;
|
||||
init_srch_key(&key, type, 0, 0);
|
||||
continue;
|
||||
} else {
|
||||
ret = 0;
|
||||
}
|
||||
}
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* skip any files already being compacted */
|
||||
if (scoutfs_spbm_test(&busy, le64_to_cpu(sfl.ref.blkno)))
|
||||
continue;
|
||||
|
||||
/* see if we ran out of log files or files entirely */
|
||||
if (key.sk_type != type) {
|
||||
sc->nr = 0;
|
||||
if (key.sk_type == SCOUTFS_SRCH_BLOCKS_TYPE) {
|
||||
type = SCOUTFS_SRCH_BLOCKS_TYPE;
|
||||
} else {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* reset if we iterated into the next size category */
|
||||
if (type == SCOUTFS_SRCH_BLOCKS_TYPE) {
|
||||
order = fls64(le64_to_cpu(sfl.blocks)) /
|
||||
|
||||
@@ -352,10 +352,10 @@ static int scoutfs_read_super_from_bdev(struct super_block *sb,
|
||||
}
|
||||
|
||||
|
||||
if (super->version != cpu_to_le64(SCOUTFS_INTEROP_VERSION)) {
|
||||
scoutfs_err(sb, "super block has invalid version %llu, expected %llu",
|
||||
le64_to_cpu(super->version),
|
||||
SCOUTFS_INTEROP_VERSION);
|
||||
if (super->format_hash != cpu_to_le64(SCOUTFS_FORMAT_HASH)) {
|
||||
scoutfs_err(sb, "super block has invalid format hash 0x%llx, expected 0x%llx",
|
||||
le64_to_cpu(super->format_hash),
|
||||
SCOUTFS_FORMAT_HASH);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
@@ -682,10 +682,6 @@ static int __init scoutfs_module_init(void)
|
||||
".section .note.git_describe,\"a\"\n"
|
||||
".string \""SCOUTFS_GIT_DESCRIBE"\\n\"\n"
|
||||
".previous\n");
|
||||
__asm__ __volatile__ (
|
||||
".section .note.scoutfs_interop_version,\"a\"\n"
|
||||
".string \""SCOUTFS_INTEROP_VERSION_STR"\\n\"\n"
|
||||
".previous\n");
|
||||
|
||||
scoutfs_init_counters();
|
||||
|
||||
@@ -718,4 +714,3 @@ module_exit(scoutfs_module_exit)
|
||||
MODULE_AUTHOR("Zach Brown <zab@versity.com>");
|
||||
MODULE_LICENSE("GPL");
|
||||
MODULE_INFO(git_describe, SCOUTFS_GIT_DESCRIBE);
|
||||
MODULE_INFO(scoutfs_interop_version, SCOUTFS_INTEROP_VERSION_STR);
|
||||
|
||||
@@ -60,6 +60,8 @@
|
||||
*/
|
||||
struct trans_info {
|
||||
spinlock_t lock;
|
||||
unsigned reserved_items;
|
||||
unsigned reserved_vals;
|
||||
unsigned holders;
|
||||
bool writing;
|
||||
|
||||
@@ -316,11 +318,12 @@ void scoutfs_trans_restart_sync_deadline(struct super_block *sb)
|
||||
* Including nested holds avoids having to deal with writing out partial
|
||||
* transactions while a caller still holds the transaction.
|
||||
*/
|
||||
|
||||
#define SCOUTFS_RESERVATION_MAGIC 0xd57cd13b
|
||||
struct scoutfs_reservation {
|
||||
unsigned magic;
|
||||
unsigned holders;
|
||||
struct scoutfs_item_count reserved;
|
||||
struct scoutfs_item_count actual;
|
||||
};
|
||||
|
||||
/*
|
||||
@@ -337,16 +340,22 @@ struct scoutfs_reservation {
|
||||
* delaying or prematurely forcing commits.
|
||||
*/
|
||||
static bool acquired_hold(struct super_block *sb,
|
||||
struct scoutfs_reservation *rsv)
|
||||
struct scoutfs_reservation *rsv,
|
||||
const struct scoutfs_item_count *cnt)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
DECLARE_TRANS_INFO(sb, tri);
|
||||
bool acquired = false;
|
||||
unsigned items;
|
||||
unsigned vals;
|
||||
|
||||
spin_lock(&tri->lock);
|
||||
|
||||
trace_scoutfs_trans_acquired_hold(sb, rsv, rsv->holders,
|
||||
tri->holders, tri->writing);
|
||||
trace_scoutfs_trans_acquired_hold(sb, cnt, rsv, rsv->holders,
|
||||
&rsv->reserved, &rsv->actual,
|
||||
tri->holders, tri->writing,
|
||||
tri->reserved_items,
|
||||
tri->reserved_vals);
|
||||
|
||||
/* use a caller's existing reservation */
|
||||
if (rsv->holders)
|
||||
@@ -356,6 +365,10 @@ static bool acquired_hold(struct super_block *sb,
|
||||
if (tri->writing)
|
||||
goto out;
|
||||
|
||||
/* see if we can reserve space for our item count */
|
||||
items = tri->reserved_items + cnt->items;
|
||||
vals = tri->reserved_vals + cnt->vals;
|
||||
|
||||
/*
|
||||
* In theory each dirty item page could be straddling two full
|
||||
* blocks, requiring 4 allocations for each item cache page.
|
||||
@@ -392,6 +405,12 @@ static bool acquired_hold(struct super_block *sb,
|
||||
goto out;
|
||||
}
|
||||
|
||||
tri->reserved_items = items;
|
||||
tri->reserved_vals = vals;
|
||||
|
||||
rsv->reserved.items = cnt->items;
|
||||
rsv->reserved.vals = cnt->vals;
|
||||
|
||||
hold:
|
||||
rsv->holders++;
|
||||
tri->holders++;
|
||||
@@ -404,12 +423,20 @@ out:
|
||||
return acquired;
|
||||
}
|
||||
|
||||
int scoutfs_hold_trans(struct super_block *sb)
|
||||
int scoutfs_hold_trans(struct super_block *sb,
|
||||
const struct scoutfs_item_count cnt)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_reservation *rsv;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Caller shouldn't provide garbage counts, nor counts that
|
||||
* can't fit in segments by themselves.
|
||||
*/
|
||||
if (WARN_ON_ONCE(cnt.items <= 0 || cnt.vals < 0))
|
||||
return -EINVAL;
|
||||
|
||||
if (current == sbi->trans_task)
|
||||
return 0;
|
||||
|
||||
@@ -426,7 +453,7 @@ int scoutfs_hold_trans(struct super_block *sb)
|
||||
BUG_ON(rsv->magic != SCOUTFS_RESERVATION_MAGIC);
|
||||
|
||||
ret = wait_event_interruptible(sbi->trans_hold_wq,
|
||||
acquired_hold(sb, rsv));
|
||||
acquired_hold(sb, rsv, &cnt));
|
||||
if (ret && rsv->holders == 0) {
|
||||
current->journal_info = NULL;
|
||||
kfree(rsv);
|
||||
@@ -446,6 +473,38 @@ bool scoutfs_trans_held(void)
|
||||
return rsv && rsv->magic == SCOUTFS_RESERVATION_MAGIC;
|
||||
}
|
||||
|
||||
/*
|
||||
* Record a transaction holder's individual contribution to the dirty
|
||||
* items in the current transaction. We're making sure that the
|
||||
* reservation matches the possible item manipulations while they hold
|
||||
* the reservation.
|
||||
*
|
||||
* It is possible and legitimate for an individual contribution to be
|
||||
* negative if they delete dirty items. The item cache makes sure that
|
||||
* the total dirty item count doesn't fall below zero.
|
||||
*/
|
||||
void scoutfs_trans_track_item(struct super_block *sb, signed items,
|
||||
signed vals)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_reservation *rsv = current->journal_info;
|
||||
|
||||
if (current == sbi->trans_task)
|
||||
return;
|
||||
|
||||
BUG_ON(!rsv || rsv->magic != SCOUTFS_RESERVATION_MAGIC);
|
||||
|
||||
rsv->actual.items += items;
|
||||
rsv->actual.vals += vals;
|
||||
|
||||
trace_scoutfs_trans_track_item(sb, items, vals, rsv->actual.items,
|
||||
rsv->actual.vals, rsv->reserved.items,
|
||||
rsv->reserved.vals);
|
||||
|
||||
WARN_ON_ONCE(rsv->actual.items > rsv->reserved.items);
|
||||
WARN_ON_ONCE(rsv->actual.vals > rsv->reserved.vals);
|
||||
}
|
||||
|
||||
/*
|
||||
* As we drop the last hold in the reservation we try and wake other
|
||||
* hold attempts that were waiting for space. As we drop the last trans
|
||||
@@ -467,12 +526,16 @@ void scoutfs_release_trans(struct super_block *sb)
|
||||
|
||||
spin_lock(&tri->lock);
|
||||
|
||||
trace_scoutfs_release_trans(sb, rsv, rsv->holders, tri->holders, tri->writing);
|
||||
trace_scoutfs_release_trans(sb, rsv, rsv->holders, &rsv->reserved,
|
||||
&rsv->actual, tri->holders, tri->writing,
|
||||
tri->reserved_items, tri->reserved_vals);
|
||||
|
||||
BUG_ON(rsv->holders <= 0);
|
||||
BUG_ON(tri->holders <= 0);
|
||||
|
||||
if (--rsv->holders == 0) {
|
||||
tri->reserved_items -= rsv->reserved.items;
|
||||
tri->reserved_vals -= rsv->reserved.vals;
|
||||
current->journal_info = NULL;
|
||||
kfree(rsv);
|
||||
wake = true;
|
||||
|
||||
@@ -6,16 +6,21 @@
|
||||
/* the client will force commits if data allocators get too low */
|
||||
#define SCOUTFS_TRANS_DATA_ALLOC_LWM (256ULL * 1024 * 1024)
|
||||
|
||||
#include "count.h"
|
||||
|
||||
void scoutfs_trans_write_func(struct work_struct *work);
|
||||
int scoutfs_trans_sync(struct super_block *sb, int wait);
|
||||
int scoutfs_file_fsync(struct file *file, loff_t start, loff_t end,
|
||||
int datasync);
|
||||
void scoutfs_trans_restart_sync_deadline(struct super_block *sb);
|
||||
|
||||
int scoutfs_hold_trans(struct super_block *sb);
|
||||
int scoutfs_hold_trans(struct super_block *sb,
|
||||
const struct scoutfs_item_count cnt);
|
||||
bool scoutfs_trans_held(void);
|
||||
void scoutfs_release_trans(struct super_block *sb);
|
||||
u64 scoutfs_trans_sample_seq(struct super_block *sb);
|
||||
void scoutfs_trans_track_item(struct super_block *sb, signed items,
|
||||
signed vals);
|
||||
|
||||
int scoutfs_trans_get_log_trees(struct super_block *sb);
|
||||
bool scoutfs_trans_has_dirty(struct super_block *sb);
|
||||
|
||||
@@ -577,7 +577,10 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name,
|
||||
retry:
|
||||
ret = scoutfs_inode_index_start(sb, &ind_seq) ?:
|
||||
scoutfs_inode_index_prepare(sb, &ind_locks, inode, false) ?:
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq);
|
||||
scoutfs_inode_index_try_lock_hold(sb, &ind_locks, ind_seq,
|
||||
SIC_XATTR_SET(found_parts,
|
||||
value != NULL,
|
||||
name_len, size));
|
||||
if (ret > 0)
|
||||
goto retry;
|
||||
if (ret)
|
||||
@@ -778,7 +781,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
|
||||
&tgs) != 0)
|
||||
memset(&tgs, 0, sizeof(tgs));
|
||||
|
||||
ret = scoutfs_hold_trans(sb);
|
||||
ret = scoutfs_hold_trans(sb, SIC_EXACT(2, 0));
|
||||
if (ret < 0)
|
||||
break;
|
||||
release = true;
|
||||
|
||||
@@ -59,8 +59,5 @@ t_filter_dmesg()
|
||||
# some tests mount w/o options
|
||||
re="$re|scoutfs .* error: Required mount option \"metadev_path\" not found"
|
||||
|
||||
# in debugging kernels we can slow things down a bit
|
||||
re="$re|hrtimer: interrupt took .*"
|
||||
|
||||
egrep -v "($re)"
|
||||
}
|
||||
|
||||
@@ -23,18 +23,3 @@ t_require_mounts() {
|
||||
test "$T_NR_MOUNTS" -ge "$req" || \
|
||||
t_skip "$req mounts required, only have $T_NR_MOUNTS"
|
||||
}
|
||||
|
||||
#
|
||||
# Require that the meta device be at least the size string argument, as
|
||||
# parsed by numfmt using single char base 2 suffixes (iec).. 64G, etc.
|
||||
#
|
||||
t_require_meta_size() {
|
||||
local dev="$T_META_DEVICE"
|
||||
local req_iec="$1"
|
||||
local req_bytes=$(numfmt --from=iec --to=none $req_iec)
|
||||
local dev_bytes=$(blockdev --getsize64 $dev)
|
||||
local dev_iec=$(numfmt --from=auto --to=iec $dev_bytes)
|
||||
|
||||
test "$dev_bytes" -ge "$req_bytes" || \
|
||||
t_skip "$dev must be at least $req_iec, is $dev_iec"
|
||||
}
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
== create per mount files
|
||||
== 30s of racing random mount/umount
|
||||
== mounting any unmounted
|
||||
@@ -53,7 +53,7 @@ $(basename $0) options:
|
||||
-m | Run mkfs on the device before mounting and running
|
||||
| tests. Implies unmounting existing mounts first.
|
||||
-n | The number of devices and mounts to test.
|
||||
-P | Enable trace_printk.
|
||||
-P | Output trace events with printk as they're generated.
|
||||
-p | Exit script after preparing mounts only, don't run tests.
|
||||
-q <nr> | Specify the quorum count needed to mount. This is
|
||||
| used when running mkfs and is needed by a few tests.
|
||||
@@ -62,7 +62,6 @@ $(basename $0) options:
|
||||
| exist. Previous results will be deleted as each test runs.
|
||||
-s | Skip git repo checkouts.
|
||||
-t | Enabled trace events that match the given glob argument.
|
||||
| Multiple options enable multiple globbed events.
|
||||
-X | xfstests git repo. Used by tests/xfstests.sh.
|
||||
-x | xfstests git branch to checkout and track.
|
||||
-y | xfstests ./check additional args
|
||||
@@ -78,9 +77,6 @@ done
|
||||
T_TRACE_DUMP="0"
|
||||
T_TRACE_PRINTK="0"
|
||||
|
||||
# array declarations to be able to use array ops
|
||||
declare -a T_TRACE_GLOB
|
||||
|
||||
while true; do
|
||||
case $1 in
|
||||
-a)
|
||||
@@ -151,7 +147,7 @@ while true; do
|
||||
;;
|
||||
-t)
|
||||
test -n "$2" || die "-t must have trace glob argument"
|
||||
T_TRACE_GLOB+=("$2")
|
||||
T_TRACE_GLOB="$2"
|
||||
shift
|
||||
;;
|
||||
-X)
|
||||
@@ -318,36 +314,22 @@ if [ -n "$T_INSMOD" ]; then
|
||||
cmd insmod "$T_KMOD/src/scoutfs.ko"
|
||||
fi
|
||||
|
||||
nr_globs=${#T_TRACE_GLOB[@]}
|
||||
if [ $nr_globs -gt 0 ]; then
|
||||
if [ -n "$T_TRACE_GLOB" ]; then
|
||||
msg "enabling trace events"
|
||||
echo 0 > /sys/kernel/debug/tracing/events/scoutfs/enable
|
||||
|
||||
for g in "${T_TRACE_GLOB[@]}"; do
|
||||
for g in $T_TRACE_GLOB; do
|
||||
for e in /sys/kernel/debug/tracing/events/scoutfs/$g/enable; do
|
||||
if test -w "$e"; then
|
||||
echo 1 > "$e"
|
||||
else
|
||||
die "-t glob '$g' matched no scoutfs events"
|
||||
fi
|
||||
echo 1 > $e
|
||||
done
|
||||
done
|
||||
|
||||
nr_events=$(cat /sys/kernel/debug/tracing/set_event | wc -l)
|
||||
msg "enabled $nr_events trace events from $nr_globs -t globs"
|
||||
fi
|
||||
|
||||
if [ -n "$T_TRACE_PRINTK" ]; then
|
||||
echo "$T_TRACE_PRINTK" > /sys/kernel/debug/tracing/options/trace_printk
|
||||
fi
|
||||
|
||||
if [ -n "$T_TRACE_DUMP" ]; then
|
||||
echo "$T_TRACE_DUMP" > /proc/sys/kernel/ftrace_dump_on_oops
|
||||
fi
|
||||
echo "$T_TRACE_PRINTK" > /sys/kernel/debug/tracing/options/trace_printk
|
||||
|
||||
# always describe tracing in the logs
|
||||
cmd cat /sys/kernel/debug/tracing/set_event
|
||||
cmd grep . /sys/kernel/debug/tracing/options/trace_printk \
|
||||
/proc/sys/kernel/ftrace_dump_on_oops
|
||||
cmd cat /sys/kernel/debug/tracing/set_event
|
||||
cmd grep . /sys/kernel/debug/tracing/options/trace_printk \
|
||||
/proc/sys/kernel/ftrace_dump_on_oops
|
||||
fi
|
||||
|
||||
#
|
||||
# mount concurrently so that a quorum is present to elect the leader and
|
||||
@@ -452,7 +434,7 @@ for t in $tests; do
|
||||
|
||||
# get stats from previous pass
|
||||
last="$T_RESULTS/last-passed-test-stats"
|
||||
stats=$(grep -s "^$test_name " "$last" | cut -d " " -f 2-)
|
||||
stats=$(grep -s "^$test_name" "$last" | cut -d " " -f 2-)
|
||||
test -n "$stats" && stats="last: $stats"
|
||||
|
||||
printf " %-30s $stats" "$test_name"
|
||||
@@ -515,7 +497,7 @@ for t in $tests; do
|
||||
echo " passed: $stats"
|
||||
((passed++))
|
||||
# save stats for passed test
|
||||
grep -s -v "^$test_name " "$last" > "$last.tmp"
|
||||
grep -s -v "^$test_name" "$last" > "$last.tmp"
|
||||
echo "$test_name $stats" >> "$last.tmp"
|
||||
mv -f "$last.tmp" "$last"
|
||||
elif [ "$sts" == "$T_SKIP_STATUS" ]; then
|
||||
@@ -533,24 +515,24 @@ done
|
||||
|
||||
msg "all tests run: $passed passed, $skipped skipped, $failed failed"
|
||||
|
||||
unmount_all
|
||||
|
||||
if [ -n "$T_TRACE_GLOB" -o -n "$T_TRACE_PRINTK" ]; then
|
||||
if [ -n "$T_TRACE_GLOB" ]; then
|
||||
msg "saving traces and disabling tracing"
|
||||
echo 0 > /sys/kernel/debug/tracing/events/scoutfs/enable
|
||||
echo 0 > /sys/kernel/debug/tracing/options/trace_printk
|
||||
cat /sys/kernel/debug/tracing/trace > "$T_RESULTS/traces"
|
||||
fi
|
||||
|
||||
if [ "$skipped" == 0 -a "$failed" == 0 ]; then
|
||||
status=1
|
||||
if [ "$failed" == 0 ]; then
|
||||
msg "all tests passed"
|
||||
unmount_all
|
||||
exit 0
|
||||
status=0
|
||||
fi
|
||||
|
||||
if [ "$skipped" != 0 ]; then
|
||||
msg "$skipped tests skipped, check skip.log, still mounted"
|
||||
msg "$skipped tests skipped, check skip.log"
|
||||
fi
|
||||
if [ "$failed" != 0 ]; then
|
||||
msg "$failed tests failed, check fail.log, still mounted"
|
||||
msg "$failed tests failed, check fail.log"
|
||||
fi
|
||||
exit 1
|
||||
exit $status
|
||||
|
||||
@@ -25,8 +25,7 @@ lock-conflicting-batch-commit.sh
|
||||
cross-mount-data-free.sh
|
||||
persistent-item-vers.sh
|
||||
setup-error-teardown.sh
|
||||
# failing in jenkins pr runners, zab's working on it
|
||||
#umount-unmount-race.sh
|
||||
mount-unmount-race.sh
|
||||
createmany-parallel-mounts.sh
|
||||
archive-light-cycle.sh
|
||||
stale-btree-read.sh
|
||||
|
||||
@@ -160,8 +160,8 @@ for i in $(seq 1 1); do
|
||||
mkdir -p $(dirname $lnk)
|
||||
ln "$T_D0/file" $lnk
|
||||
|
||||
scoutfs ino-path -p "$T_M0" $ino > "$T_TMP.0"
|
||||
scoutfs ino-path -p "$T_M1" $ino > "$T_TMP.1"
|
||||
scoutfs ino-path $ino "$T_M0" > "$T_TMP.0"
|
||||
scoutfs ino-path $ino "$T_M1" > "$T_TMP.1"
|
||||
diff -u "$T_TMP.0" "$T_TMP.1"
|
||||
done
|
||||
done
|
||||
|
||||
@@ -50,7 +50,7 @@ for m in 0 1; do
|
||||
done
|
||||
wait
|
||||
CONF="$((SECONDS - START))"
|
||||
echo "conf: $CONF" >> $T_TMP.log
|
||||
echo "conf: $IND" >> $T_TMP.log
|
||||
|
||||
if [ "$CONF" -gt "$((IND * 5))" ]; then
|
||||
t_fail "conflicting $CONF secs is more than 5x independent $IND secs"
|
||||
|
||||
@@ -19,10 +19,10 @@
|
||||
|
||||
# make sure we have our config
|
||||
if [ -z "$T_XFSTESTS_REPO" ]; then
|
||||
t_fail "xfstests requires -X repo"
|
||||
t_skip "xfstests requires -X repo"
|
||||
fi
|
||||
if [ -z "$T_XFSTESTS_BRANCH" -a -z "$T_SKIP_CHECKOUT" ]; then
|
||||
t_fail "xfstests requires -x branch"
|
||||
t_skip "xfstests requires -x branch"
|
||||
fi
|
||||
|
||||
t_quiet mkdir -p "$T_TMPDIR/mnt.scratch"
|
||||
@@ -83,7 +83,7 @@ generic/375 # utils output change? update branch?
|
||||
EOF
|
||||
|
||||
t_restore_output
|
||||
echo " (showing output of xfstests)"
|
||||
echo "(showing output of xfstests)"
|
||||
|
||||
args="-E local.exclude ${T_XFSTESTS_ARGS:--g quick}"
|
||||
./check $args
|
||||
|
||||
@@ -1,11 +1,23 @@
|
||||
#
|
||||
# The userspace utils and kernel module share definitions of physical
|
||||
# structures and ioctls. If we're in the repo we include the kmod
|
||||
# headers directly, and hash them directly to calculate the format hash.
|
||||
#
|
||||
# If we're creating a standalone tarball for distribution we copy the
|
||||
# headers out of the kmod dir into the tarball. And then when we're
|
||||
# building in that tarball we use the headers in src/ directly.
|
||||
#
|
||||
FMTIOC_H := format.h ioctl.h
|
||||
FMTIOC_DIST := $(addprefix src/,$(FMTIOC_H))
|
||||
FMTIOC_KMOD := $(addprefix ../kmod/src/,$(FMTIOC_H))
|
||||
|
||||
ifneq ($(wildcard $(firstword $(FMTIOC_KMOD))),)
|
||||
HASH_FILES := $(FMTIOC_KMOD)
|
||||
else
|
||||
HASH_FILES := $(FMTIOC_DIST)
|
||||
endif
|
||||
SCOUTFS_FORMAT_HASH := $(shell cat $(HASH_FILES) | md5sum | cut -b1-16)
|
||||
|
||||
CFLAGS := -Wall -O2 -Werror -D_FILE_OFFSET_BITS=64 -g -msse4.2 \
|
||||
-fno-strict-aliasing \
|
||||
-DSCOUTFS_FORMAT_HASH=0x$(SCOUTFS_FORMAT_HASH)LLU
|
||||
|
||||
@@ -205,7 +205,7 @@ static int do_mkfs(struct mkfs_args *args)
|
||||
pseudo_random_bytes(&super->hdr.fsid, sizeof(super->hdr.fsid));
|
||||
super->hdr.magic = cpu_to_le32(SCOUTFS_BLOCK_MAGIC_SUPER);
|
||||
super->hdr.seq = cpu_to_le64(1);
|
||||
super->version = cpu_to_le64(SCOUTFS_INTEROP_VERSION);
|
||||
super->format_hash = cpu_to_le64(SCOUTFS_FORMAT_HASH);
|
||||
uuid_generate(super->uuid);
|
||||
super->next_ino = cpu_to_le64(SCOUTFS_ROOT_INO + 1);
|
||||
super->next_trans_seq = cpu_to_le64(1);
|
||||
@@ -352,7 +352,7 @@ static int do_mkfs(struct mkfs_args *args)
|
||||
" meta device path: %s\n"
|
||||
" data device path: %s\n"
|
||||
" fsid: %llx\n"
|
||||
" version: %llx\n"
|
||||
" format hash: %llx\n"
|
||||
" uuid: %s\n"
|
||||
" 64KB metadata blocks: "SIZE_FMT"\n"
|
||||
" 4KB data blocks: "SIZE_FMT"\n"
|
||||
@@ -360,7 +360,7 @@ static int do_mkfs(struct mkfs_args *args)
|
||||
args->meta_device,
|
||||
args->data_device,
|
||||
le64_to_cpu(super->hdr.fsid),
|
||||
le64_to_cpu(super->version),
|
||||
le64_to_cpu(super->format_hash),
|
||||
uuid_str,
|
||||
SIZE_ARGS(le64_to_cpu(super->total_meta_blocks),
|
||||
SCOUTFS_BLOCK_LG_SIZE),
|
||||
|
||||
@@ -860,8 +860,8 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
|
||||
|
||||
printf("super blkno %llu\n", blkno);
|
||||
print_block_header(&super->hdr, SCOUTFS_BLOCK_SM_SIZE);
|
||||
printf(" version %llx uuid %s\n",
|
||||
le64_to_cpu(super->version), uuid_str);
|
||||
printf(" format_hash %llx uuid %s\n",
|
||||
le64_to_cpu(super->format_hash), uuid_str);
|
||||
printf(" flags: 0x%016llx\n", le64_to_cpu(super->flags));
|
||||
|
||||
server_addr = alloc_addr_str(&super->server_addr);
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include <assert.h>
|
||||
#include <argp.h>
|
||||
|
||||
#include "sparse.h"
|
||||
@@ -207,6 +208,9 @@ static int do_release(struct release_args *args)
|
||||
return ret;
|
||||
}
|
||||
|
||||
assert(args->offset % SCOUTFS_BLOCK_SM_SIZE == 0);
|
||||
assert(args->length % SCOUTFS_BLOCK_SM_SIZE == 0);
|
||||
|
||||
ioctl_args.offset = args->offset;
|
||||
ioctl_args.length = args->length;
|
||||
ioctl_args.data_version = args->data_version;
|
||||
|
||||
Reference in New Issue
Block a user