Merge pull request #2 from versity/zab/stage_read_zero_block

Zab/stage read zero block
This commit is contained in:
Andy Grover
2020-12-17 16:48:52 -08:00
committed by GitHub
6 changed files with 209 additions and 59 deletions

View File

@@ -93,6 +93,16 @@ static void ext_from_item(struct scoutfs_extent *ext,
ext->flags = dv->flags;
}
static void data_ext_op_warn(struct inode *inode)
{
struct scoutfs_inode_info *si;
if (inode) {
si = SCOUTFS_I(inode);
WARN_ON_ONCE(!rwsem_is_locked(&si->extent_sem));
}
}
static int data_ext_next(struct super_block *sb, void *arg, u64 start, u64 len,
struct scoutfs_extent *ext)
{
@@ -102,6 +112,8 @@ static int data_ext_next(struct super_block *sb, void *arg, u64 start, u64 len,
struct scoutfs_key last;
int ret;
data_ext_op_warn(args->inode);
item_from_extent(&last, &dv, args->ino, U64_MAX, 1, 0, 0);
item_from_extent(&key, &dv, args->ino, start, len, 0, 0);
@@ -139,6 +151,8 @@ static int data_ext_insert(struct super_block *sb, void *arg, u64 start,
struct scoutfs_key key;
int ret;
data_ext_op_warn(args->inode);
item_from_extent(&key, &dv, args->ino, start, len, map, flags);
ret = scoutfs_item_create(sb, &key, &dv, sizeof(dv), args->lock);
if (ret == 0 && args->inode)
@@ -154,6 +168,8 @@ static int data_ext_remove(struct super_block *sb, void *arg, u64 start,
struct scoutfs_key key;
int ret;
data_ext_op_warn(args->inode);
item_from_extent(&key, &dv, args->ino, start, len, map, flags);
ret = scoutfs_item_delete(sb, &key, args->lock);
if (ret == 0 && args->inode)
@@ -276,6 +292,7 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
struct scoutfs_lock *lock)
{
struct scoutfs_item_count cnt = SIC_TRUNC_EXTENT(inode);
struct scoutfs_inode_info *si = NULL;
LIST_HEAD(ind_locks);
s64 ret = 0;
@@ -290,6 +307,11 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
if (WARN_ON_ONCE(last < iblock))
return -EINVAL;
if (inode) {
si = SCOUTFS_I(inode);
down_write(&si->extent_sem);
}
while (iblock <= last) {
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
@@ -321,6 +343,9 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
ret = 0;
}
if (si)
up_write(&si->extent_sem);
return ret;
}
@@ -533,6 +558,38 @@ out:
return ret;
}
/*
* Typically extent item users are serialized by i_mutex. But page
* readers only hold the page lock and need to be protected from writers
* in other pages which can be manipulating neighbouring extents as
* they split and merge.
*/
static int scoutfs_get_block_read(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
int ret;
down_read(&si->extent_sem);
ret = scoutfs_get_block(inode, iblock, bh, create);
up_read(&si->extent_sem);
return ret;
}
static int scoutfs_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
int ret;
down_write(&si->extent_sem);
ret = scoutfs_get_block(inode, iblock, bh, create);
up_write(&si->extent_sem);
return ret;
}
/*
* This is almost never used. We can't block on a cluster lock while
* holding the page lock because lock invalidation gets the page lock
@@ -598,7 +655,7 @@ static int scoutfs_readpage(struct file *file, struct page *page)
return ret;
}
ret = mpage_readpage(page, scoutfs_get_block);
ret = mpage_readpage(page, scoutfs_get_block_read);
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
@@ -646,7 +703,7 @@ static int scoutfs_readpages(struct file *file, struct address_space *mapping,
}
}
ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block);
ret = mpage_readpages(mapping, pages, nr_pages, scoutfs_get_block_read);
out:
scoutfs_unlock(sb, inode_lock, SCOUTFS_LOCK_READ);
BUG_ON(!list_empty(pages));
@@ -655,13 +712,13 @@ out:
static int scoutfs_writepage(struct page *page, struct writeback_control *wbc)
{
return block_write_full_page(page, scoutfs_get_block, wbc);
return block_write_full_page(page, scoutfs_get_block_write, wbc);
}
static int scoutfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
return mpage_writepages(mapping, wbc, scoutfs_get_block);
return mpage_writepages(mapping, wbc, scoutfs_get_block_write);
}
/* fsdata allocated in write_begin and freed in write_end */
@@ -715,7 +772,7 @@ static int scoutfs_write_begin(struct file *file,
ret = scoutfs_dirty_inode_item(inode, wbd->lock);
if (ret == 0)
ret = block_write_begin(mapping, pos, len, flags, pagep,
scoutfs_get_block);
scoutfs_get_block_write);
if (ret)
scoutfs_release_trans(sb);
out:
@@ -903,6 +960,7 @@ static s64 fallocate_extents(struct super_block *sb, struct inode *inode,
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_lock *lock = NULL;
@@ -913,6 +971,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
s64 ret;
mutex_lock(&inode->i_mutex);
down_write(&si->extent_sem);
/* XXX support more flags */
if (mode & ~(FALLOC_FL_KEEP_SIZE)) {
@@ -978,6 +1037,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
out:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
up_write(&si->extent_sem);
mutex_unlock(&inode->i_mutex);
trace_scoutfs_data_fallocate(sb, ino, mode, offset, len, ret);
@@ -998,6 +1058,7 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct data_ext_args args = {
.ino = scoutfs_ino(inode),
@@ -1028,8 +1089,10 @@ int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
if (ret < 0)
goto unlock;
down_write(&si->extent_sem);
ret = scoutfs_ext_insert(sb, &data_ext_ops, &args,
0, count, 0, SEF_OFFLINE);
up_write(&si->extent_sem);
if (ret < 0)
goto unlock;
@@ -1075,6 +1138,7 @@ static int fill_extent(struct fiemap_extent_info *fieinfo,
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_lock *lock = NULL;
@@ -1095,8 +1159,8 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
goto out;
/* XXX overkill? */
mutex_lock(&inode->i_mutex);
down_read(&si->extent_sem);
ret = scoutfs_lock_inode(sb, SCOUTFS_LOCK_READ, 0, inode, &lock);
if (ret)
@@ -1148,6 +1212,7 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
ret = fill_extent(fieinfo, &cur, last_flags);
unlock:
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
up_read(&si->extent_sem);
mutex_unlock(&inode->i_mutex);
out:
@@ -1227,8 +1292,9 @@ static struct scoutfs_data_wait *dw_next(struct scoutfs_data_wait *dw)
* Check if we should wait by looking for extents whose flags match.
* Returns 0 if no extents were found or any error encountered.
*
* The caller must have locked the extents before calling, both across
* mounts and within this mount.
* The caller must have acquired a cluster lock that covers the extent
* items. We acquire the extent_sem to protect our read from writers in
* other tasks.
*
* Returns 1 if any file extents in the caller's region matched. If the
* wait struct is provided then it is initialized to be woken when the
@@ -1240,6 +1306,7 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
u8 sef, u8 op, struct scoutfs_data_wait *dw,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct data_ext_args args = {
@@ -1272,6 +1339,8 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
}
}
down_read(&si->extent_sem);
iblock = pos >> SCOUTFS_BLOCK_SM_SHIFT;
last_block = (pos + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
@@ -1308,6 +1377,8 @@ int scoutfs_data_wait_check(struct inode *inode, loff_t pos, loff_t len,
iblock = ext.start + ext.len;
}
up_read(&si->extent_sem);
out:
trace_scoutfs_data_wait_check(sb, ino, pos, len, sef, op, &ext, ret);

View File

@@ -71,29 +71,30 @@ static struct kmem_cache *scoutfs_inode_cachep;
*/
static void scoutfs_inode_ctor(void *obj)
{
struct scoutfs_inode_info *ci = obj;
struct scoutfs_inode_info *si = obj;
mutex_init(&ci->item_mutex);
seqcount_init(&ci->seqcount);
ci->staging = false;
scoutfs_per_task_init(&ci->pt_data_lock);
atomic64_set(&ci->data_waitq.changed, 0);
init_waitqueue_head(&ci->data_waitq.waitq);
init_rwsem(&ci->xattr_rwsem);
RB_CLEAR_NODE(&ci->writeback_node);
init_rwsem(&si->extent_sem);
mutex_init(&si->item_mutex);
seqcount_init(&si->seqcount);
si->staging = false;
scoutfs_per_task_init(&si->pt_data_lock);
atomic64_set(&si->data_waitq.changed, 0);
init_waitqueue_head(&si->data_waitq.waitq);
init_rwsem(&si->xattr_rwsem);
RB_CLEAR_NODE(&si->writeback_node);
inode_init_once(&ci->inode);
inode_init_once(&si->inode);
}
struct inode *scoutfs_alloc_inode(struct super_block *sb)
{
struct scoutfs_inode_info *ci;
struct scoutfs_inode_info *si;
ci = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);
if (!ci)
si = kmem_cache_alloc(scoutfs_inode_cachep, GFP_NOFS);
if (!si)
return NULL;
return &ci->inode;
return &si->inode;
}
static void scoutfs_i_callback(struct rcu_head *head)
@@ -221,7 +222,7 @@ static void set_item_info(struct scoutfs_inode_info *si,
static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
i_size_write(inode, le64_to_cpu(cinode->size));
set_nlink(inode, le32_to_cpu(cinode->nlink));
@@ -236,23 +237,23 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode)
inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec);
inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec);
ci->meta_seq = le64_to_cpu(cinode->meta_seq);
ci->data_seq = le64_to_cpu(cinode->data_seq);
ci->data_version = le64_to_cpu(cinode->data_version);
ci->online_blocks = le64_to_cpu(cinode->online_blocks);
ci->offline_blocks = le64_to_cpu(cinode->offline_blocks);
ci->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos);
ci->next_xattr_id = le64_to_cpu(cinode->next_xattr_id);
ci->flags = le32_to_cpu(cinode->flags);
si->meta_seq = le64_to_cpu(cinode->meta_seq);
si->data_seq = le64_to_cpu(cinode->data_seq);
si->data_version = le64_to_cpu(cinode->data_version);
si->online_blocks = le64_to_cpu(cinode->online_blocks);
si->offline_blocks = le64_to_cpu(cinode->offline_blocks);
si->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos);
si->next_xattr_id = le64_to_cpu(cinode->next_xattr_id);
si->flags = le32_to_cpu(cinode->flags);
/*
* i_blocks is initialized from online and offline and is then
* maintained as blocks come and go.
*/
inode->i_blocks = (ci->online_blocks + ci->offline_blocks)
inode->i_blocks = (si->online_blocks + si->offline_blocks)
<< SCOUTFS_BLOCK_SM_SECTOR_SHIFT;
set_item_info(ci, cinode);
set_item_info(si, cinode);
}
static void init_inode_key(struct scoutfs_key *key, u64 ino)
@@ -334,7 +335,7 @@ int scoutfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
u64 new_size, bool truncate)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
LIST_HEAD(ind_locks);
int ret;
@@ -353,7 +354,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
truncate_setsize(inode, new_size);
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
if (truncate)
ci->flags |= SCOUTFS_INO_FLAG_TRUNCATE;
si->flags |= SCOUTFS_INO_FLAG_TRUNCATE;
scoutfs_inode_set_data_seq(inode);
scoutfs_update_inode_item(inode, lock, &ind_locks);
@@ -365,7 +366,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
LIST_HEAD(ind_locks);
int ret;
@@ -375,7 +376,7 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
if (ret)
return ret;
ci->flags &= ~SCOUTFS_INO_FLAG_TRUNCATE;
si->flags &= ~SCOUTFS_INO_FLAG_TRUNCATE;
scoutfs_update_inode_item(inode, lock, &ind_locks);
scoutfs_release_trans(sb);
@@ -386,13 +387,13 @@ static int clear_truncate_flag(struct inode *inode, struct scoutfs_lock *lock)
int scoutfs_complete_truncate(struct inode *inode, struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
u64 start;
int ret, err;
trace_scoutfs_complete_truncate(inode, ci->flags);
trace_scoutfs_complete_truncate(inode, si->flags);
if (!(ci->flags & SCOUTFS_INO_FLAG_TRUNCATE))
if (!(si->flags & SCOUTFS_INO_FLAG_TRUNCATE))
return 0;
start = (i_size_read(inode) + SCOUTFS_BLOCK_SM_SIZE - 1) >>
@@ -643,19 +644,19 @@ void scoutfs_inode_get_onoff(struct inode *inode, s64 *on, s64 *off)
static int scoutfs_iget_test(struct inode *inode, void *arg)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
u64 *ino = arg;
return ci->ino == *ino;
return si->ino == *ino;
}
static int scoutfs_iget_set(struct inode *inode, void *arg)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
u64 *ino = arg;
inode->i_ino = *ino;
ci->ino = *ino;
si->ino = *ino;
return 0;
}
@@ -705,7 +706,7 @@ out:
static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
{
struct scoutfs_inode_info *ci = SCOUTFS_I(inode);
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
u64 online_blocks;
u64 offline_blocks;
@@ -732,9 +733,9 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode *inode)
cinode->data_version = cpu_to_le64(scoutfs_inode_data_version(inode));
cinode->online_blocks = cpu_to_le64(online_blocks);
cinode->offline_blocks = cpu_to_le64(offline_blocks);
cinode->next_readdir_pos = cpu_to_le64(ci->next_readdir_pos);
cinode->next_xattr_id = cpu_to_le64(ci->next_xattr_id);
cinode->flags = cpu_to_le32(ci->flags);
cinode->next_readdir_pos = cpu_to_le64(si->next_readdir_pos);
cinode->next_xattr_id = cpu_to_le64(si->next_xattr_id);
cinode->flags = cpu_to_le32(si->flags);
}
/*
@@ -1368,7 +1369,7 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
umode_t mode, dev_t rdev, u64 ino,
struct scoutfs_lock *lock)
{
struct scoutfs_inode_info *ci;
struct scoutfs_inode_info *si;
struct scoutfs_key key;
struct scoutfs_inode sinode;
struct inode *inode;
@@ -1378,16 +1379,16 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir,
if (!inode)
return ERR_PTR(-ENOMEM);
ci = SCOUTFS_I(inode);
ci->ino = ino;
ci->data_version = 0;
ci->online_blocks = 0;
ci->offline_blocks = 0;
ci->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS;
ci->next_xattr_id = 0;
ci->have_item = false;
atomic64_set(&ci->last_refreshed, lock->refresh_gen);
ci->flags = 0;
si = SCOUTFS_I(inode);
si->ino = ino;
si->data_version = 0;
si->online_blocks = 0;
si->offline_blocks = 0;
si->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS;
si->next_xattr_id = 0;
si->have_item = false;
atomic64_set(&si->last_refreshed, lock->refresh_gen);
si->flags = 0;
scoutfs_inode_set_meta_seq(inode);
scoutfs_inode_set_data_seq(inode);

View File

@@ -22,6 +22,14 @@ struct scoutfs_inode_info {
u64 offline_blocks;
u32 flags;
/*
* Protects per-inode extent items, most particularly readers
* who want to serialize writers without holding i_mutex. (only
* used in data.c, it's the only place that understands file
* extent items)
*/
struct rw_semaphore extent_sem;
/*
* The in-memory item info caches the current index item values
* so that we can decide to update them with comparisons instead

View File

View File

@@ -16,6 +16,7 @@ createmany-parallel.sh
createmany-large-names.sh
createmany-rename-large-dir.sh
stage-release-race-alloc.sh
stage-multi-part.sh
basic-posix-consistency.sh
dirent-consistency.sh
lock-ex-race-processes.sh

View File

@@ -0,0 +1,69 @@
#
# Stage a large file in multiple parts and have a reader read it while
# it's being staged. This has found problems with extent access
# locking.
#
t_require_commands scoutfs perl cmp rm
FILE_BYTES=$((4 * 1024 * 1024 * 1024))
FILE_BLOCKS=$((FILE_BYTES / 4096))
FRAG_BYTES=$((128 * 1024 * 1024))
FRAG_BLOCKS=$((FRAG_BYTES / 4096))
NR_FRAGS=$((FILE_BLOCKS / FRAG_BLOCKS))
#
# high bandwidth way to generate file contents with predictable
# contents. We use ascii lines with the block identity, padded to 4KB
# with spaces.
#
# $1 is number of 4k blocks to write, and each block gets its block
# number in the line. $2, $3, and $4 are fields that are put in every
# block.
#
gen() {
perl -e 'for (my $i = 0; $i < '$1'; $i++) { printf("mount %020u process %020u file %020u blkno %020u%s\n", '$2', '$3', '$4', $i, " " x 3987); }'
}
release_file() {
local path="$1"
local vers=$(scoutfs stat -s data_version "$path")
scoutfs release "$path" "$vers" 0 $FILE_BLOCKS
}
stage_file() {
local path="$1"
local vers=$(scoutfs stat -s data_version "$path")
local off=0
for a in $(seq 1 $NR_FRAGS); do
scoutfs stage "$path" "$vers" $off $FRAG_BYTES \
<(gen $FRAG_BLOCKS $a $a $a)
((off+=$FRAG_BYTES))
done
}
FILE="$T_D0/file"
whole_file() {
for a in $(seq 1 $NR_FRAGS); do
gen $FRAG_BLOCKS $a $a $a
done
}
#
# just one pass through the file.
#
whole_file > "$FILE"
release_file "$FILE"
cmp "$FILE" <(whole_file) &
pid=$!
stage_file "$FILE"
wait $pid || t_fail "comparison failed"
t_pass