Compare commits

...

8 Commits

Author SHA1 Message Date
Bryant G. Duffy-Ly
1029d5a0fe Enable and Disable correct unit tests for O_DIRECT
Signed-off-by: Bryant G. Duffy-Ly <bduffyly@versity.com>
2022-03-25 09:50:16 -05:00
Bryant Duffy-Ly
9fc759ce47 Fix truncate for O_DIRECT
In the buffered case, page tail zeroing happens automatically. In the
O_DIRECT case it does not, so we need to add it to our setattr path,
just like EXT2. We want to zero the end of the block that contains
i_size during truncate, so we just call block_truncate_page in
set_inode_size.
2022-03-25 09:50:16 -05:00
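For context, a minimal sketch of the tail-zeroing pattern this message describes, assuming the 3.10-era buffer_head API; only block_truncate_page() is taken from the scoutfs change (see the inode.c hunk below), while the helper name and the truncate_setsize() call are illustrative:

/* Illustrative only: zero the bytes past new_size in the block that
 * straddles it, then publish the new size.  Buffered truncate gets
 * this zeroing for free from the page cache; O_DIRECT does not. */
static int zero_tail_then_truncate(struct inode *inode, loff_t new_size,
				   get_block_t *get_block)
{
	int ret;

	/* reads the straddling block, zeroes from new_size to the end
	 * of the block and marks it dirty */
	ret = block_truncate_page(inode->i_mapping, new_size, get_block);
	if (ret)
		return ret;

	truncate_setsize(inode, new_size);
	return 0;
}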
Bryant Duffy-Ly
bb11617fe3 Fix EOF extent in last block
Currently, if there is an extent on the last block, the code will only
set EOF on ENOENT. When the last block has an extent, the loop won't go
to the next iteration due to iblock <= last, so EOF doesn't get set on
the last block in these cases. We want to just let the loop keep
looping and rely on if (ext.start > last) to protect us from an
infinite loop.
2022-03-25 09:50:16 -05:00
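A simplified reconstruction of the loop after this fix, pieced together from the fiemap hunk below; the extent-emitting step is elided and the iblock advance is an assumption:

/* Simplified sketch: terminate on what the extent lookup returns
 * rather than on an iblock bound, so the final extent is still seen. */
while (true) {
	ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &ext);
	if (ret < 0) {
		if (ret == -ENOENT) {
			/* nothing at or after iblock: mark EOF */
			ret = 0;
			last_flags = FIEMAP_EXTENT_LAST;
		}
		break;
	}

	/* the only exit for found extents: we went past the region */
	if (ext.start > last)
		break;

	/* ... emit this extent to fieinfo ... */
	iblock = ext.start + ext.len;
}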
Bryant Duffy-Ly
20370c6573 Add O_DIRECT support
We first pass a mapping of unwritten extents to the blockdev_direct_IO
call. Then, based on the number of bytes written, we convert those
unwritten extents into written extents.
2022-03-25 09:50:16 -05:00
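Condensed, the write path added below in data.c looks like the sketch here; the body mirrors the scoutfs_direct_IO hunk and the comments are editorial:

/* Two-phase direct write: map with unwritten extents first, then
 * convert only the range the write actually completed. */
static ssize_t dio_write_sketch(int rw, struct kiocb *iocb,
				const struct iovec *iov, loff_t offset,
				unsigned long nr_segs)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	ssize_t ret;

	/* phase 1: scoutfs_get_block_write_dio() hands the block layer
	 * SEF_UNWRITTEN mappings, so a short or failed write never
	 * exposes stale block contents to readers */
	ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
				 scoutfs_get_block_write_dio);

	/* phase 2: flip the bytes that were written from unwritten to
	 * written (convert_unwritten_extent() in the hunk below) */
	if (ret > 0 && (rw & WRITE))
		ret = convert_unwritten_extent(inode, offset, ret);

	return ret;
}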
Zach Brown
9c751c1197 Merge pull request #78 from versity/zab/quorum_leader_visibility
Zab/quorum leader visibility
2022-03-16 09:16:57 -07:00
Zach Brown
875583b7ef Add t_fs_is_leader test helper
The t_server_nr and t_first_client_nr helpers iterated over all the fs
numbers examining their quorum/is_leader files, but clients don't have a
quorum/ directory.  This was causing spurious output in tests that were
looking for a server but didn't find one in the first quorum fs numbers
and made it down into the clients.

Give them a helper that returns 0 for being a leader if the quorum/ dir
doesn't exist.

Signed-off-by: Zach Brown <zab@versity.com>
2022-03-15 16:09:55 -07:00
Zach Brown
38e5aa77c4 Update quorum status files more frequently
We were seeing rare test failures where it looked like is_leader wasn't
set for any of the mounts.  The test that couldn't find a set is_leader
file had just performed some mounts, so we know that a server was up and
processing requests.

The quorum task wasn't updating the status that's shown in sysfs and
debugfs until after the server started up.  This opened the race where
the server was able to serve mount requests and have the test run to
find no is_leader file set before the quorum task was able to update the
stats and make its election visible.

This updates the quorum task to make its status visible more often,
typically before it does something that will take a while.  The
is_leader file will now be visible before the server is started, so the
test will always see the file after the server starts up and lets
mounts finish.

Signed-off-by: Zach Brown <zab@versity.com>
2022-03-15 15:07:57 -07:00
Zach Brown
57a1d75e52 Merge pull request #77 from versity/zab/v1_2_release
Zab/v1 2 release
2022-03-14 18:10:16 -07:00
8 changed files with 359 additions and 33 deletions

View File

@@ -21,6 +21,7 @@
#include <linux/log2.h>
#include <linux/falloc.h>
#include <linux/writeback.h>
#include <linux/aio.h>
#include "format.h"
#include "super.h"
@@ -471,6 +472,7 @@ static int alloc_block(struct super_block *sb, struct inode *inode,
ext->map = blkno;
ext->flags = 0;
ret = 0;
out:
if (ret < 0 && blkno > 0) {
err = scoutfs_free_data(sb, datinf->alloc, datinf->wri,
@@ -488,8 +490,68 @@ out:
return ret;
}
static int alloc_block_dio(struct super_block *sb, struct inode *inode,
struct scoutfs_extent *ext, struct buffer_head *bh,
u64 iblock, struct scoutfs_lock *lock)
{
DECLARE_DATA_INFO(sb, datinf);
const u64 ino = scoutfs_ino(inode);
struct data_ext_args args = {
.ino = ino,
.inode = inode,
.lock = lock,
};
u64 blkno = 0;
u64 blocks = 0;
u64 count = 0;
u64 last;
u8 ext_fl = 0;
int ret = 0;
bool first = true;
int err;
last = (bh->b_size - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
while(blocks < last) {
if (ext->len >= last && first)
count = min_t(u64, last, SCOUTFS_FALLOCATE_ALLOC_LIMIT);
else
count = min_t(u64, last - blocks, SCOUTFS_FALLOCATE_ALLOC_LIMIT);
mutex_lock(&datinf->mutex);
ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri,
&datinf->dalloc, count, &blkno, &count);
if (ret == 0) {
ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock,
count, blkno,
ext_fl | SEF_UNWRITTEN);
if (ret < 0) {
err = scoutfs_free_data(sb, datinf->alloc,
datinf->wri,
&datinf->data_freed,
blkno, count);
BUG_ON(err); /* inconsistent */
}
}
mutex_unlock(&datinf->mutex);
if (ret < 0)
break;
blocks += count;
first = false;
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, ext);
if (ret < 0)
break;
}
return ret;
}
static int scoutfs_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
struct buffer_head *bh, int create, bool dio_flag)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
const u64 ino = scoutfs_ino(inode);
@@ -530,7 +592,7 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock,
}
/* convert unwritten to written, could be staging */
if (create && ext.map && (ext.flags & SEF_UNWRITTEN)) {
if (create && ext.map && !dio_flag && (ext.flags & SEF_UNWRITTEN)) {
un.start = iblock;
un.len = 1;
un.map = ext.map + (iblock - ext.start);
@@ -542,11 +604,26 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock,
set_buffer_new(bh);
}
goto out;
} else if (create && ext.map && dio_flag) {
un.start = iblock;
un.len = 1;
un.map = ext.map + (iblock - ext.start);
un.flags = ext.flags;
ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
un.start, un.len, un.map, un.flags);
if (ret == 0) {
ext = un;
set_buffer_new(bh);
}
goto out;
}
/* allocate and map blocks containing our logical block */
if (create && !ext.map) {
ret = alloc_block(sb, inode, &ext, iblock, lock);
if (dio_flag)
ret = alloc_block_dio(sb, inode, &ext, bh, iblock, lock);
else
ret = alloc_block(sb, inode, &ext, iblock, lock);
if (ret == 0)
set_buffer_new(bh);
} else {
@@ -580,25 +657,75 @@ static int scoutfs_get_block_read(struct inode *inode, sector_t iblock,
int ret;
down_read(&si->extent_sem);
ret = scoutfs_get_block(inode, iblock, bh, create);
ret = scoutfs_get_block(inode, iblock, bh, create, false);
up_read(&si->extent_sem);
return ret;
}
static int scoutfs_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
int scoutfs_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
int ret;
down_write(&si->extent_sem);
ret = scoutfs_get_block(inode, iblock, bh, create);
ret = scoutfs_get_block(inode, iblock, bh, create, false);
up_write(&si->extent_sem);
return ret;
}
static int scoutfs_get_block_write_dio(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
struct scoutfs_lock *lock = NULL;
LIST_HEAD(ind_locks);
int ret;
lock = scoutfs_per_task_get(&si->pt_data_lock);
if (WARN_ON_ONCE(!lock)) {
return -EINVAL;
}
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
true, false);
else
ret = scoutfs_hold_trans(sb, false);
if (ret)
goto out;
if (inode)
ret = scoutfs_dirty_inode_item(inode, lock);
if (ret < 0)
goto out_unlock;
down_write(&si->extent_sem);
ret = scoutfs_get_block(inode, iblock, bh, create, true);
up_write(&si->extent_sem);
if (inode) {
scoutfs_inode_set_data_seq(inode);
scoutfs_inode_inc_data_version(inode);
inode_inc_iversion(inode);
if (ret > 0)
i_size_write(inode, ret);
scoutfs_update_inode_item(inode, lock, &ind_locks);
}
out_unlock:
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
out:
return ret;
}
/*
* This is almost never used. We can't block on a cluster lock while
* holding the page lock because lock invalidation gets the page lock
@@ -861,6 +988,154 @@ static int scoutfs_write_end(struct file *file, struct address_space *mapping,
return ret;
}
static s64 convert_unwritten_items(struct super_block *sb, struct inode *inode,
u64 ino, u64 iblock, u64 last,
struct scoutfs_lock *lock)
{
struct data_ext_args args = {
.ino = ino,
.inode = inode,
.lock = lock,
};
struct scoutfs_extent ext;
struct scoutfs_extent un;
u64 offset;
s64 ret;
int i;
ret = 0;
for (i = 0; iblock <= last; i++) {
if (i == EXTENTS_PER_HOLD) {
ret = iblock;
break;
}
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT)
ret = 0;
break;
}
/* done if we went past the region */
if (ext.start > last) {
ret = 0;
break;
}
/* nothing to do when already marked written */
if (!(ext.flags & SEF_UNWRITTEN)) {
iblock = ext.start + ext.len;
continue;
}
iblock = max(ext.start, iblock);
offset = iblock - ext.start;
un.start = iblock;
un.map = ext.map ? ext.map + offset : 0;
un.len = min(ext.len - offset, last - iblock + 1);
un.flags = ext.flags & ~(SEF_OFFLINE|SEF_UNWRITTEN);
ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
un.start, un.len, un.map, un.flags);
if (ret < 0)
break;
iblock += un.len;
}
return ret;
}
static ssize_t
convert_unwritten_extent(struct inode *inode, loff_t offset, ssize_t count)
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
const u64 ino = scoutfs_ino(inode);
struct scoutfs_lock *lock = NULL;
LIST_HEAD(ind_locks);
u64 iblock;
u64 last;
ssize_t ret = 0;
lock = scoutfs_per_task_get(&si->pt_data_lock);
if (WARN_ON_ONCE(!lock)) {
ret = -EINVAL;
goto out;
}
iblock = offset >> SCOUTFS_BLOCK_SM_SHIFT;
last = (offset + count - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
while(iblock <= last) {
if (inode)
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks,
true, false);
else
ret = scoutfs_hold_trans(sb, false);
if (ret)
break;
if (inode)
ret = scoutfs_dirty_inode_item(inode, lock);
else
ret = 0;
if (ret == 0) {
down_write(&si->extent_sem);
ret = convert_unwritten_items(sb, inode, ino, iblock,
last, lock);
up_write(&si->extent_sem);
}
if (ret < 0)
goto out;
if (inode) {
scoutfs_inode_set_data_seq(inode);
scoutfs_inode_inc_data_version(inode);
inode_inc_iversion(inode);
if (ret > 0)
i_size_write(inode, ret);
scoutfs_update_inode_item(inode, lock, &ind_locks);
}
scoutfs_release_trans(sb);
if (inode)
scoutfs_inode_index_unlock(sb, &ind_locks);
if (ret <= 0)
break;
iblock = ret;
}
out:
return ret;
}
static ssize_t
scoutfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t ret;
ret = blockdev_direct_IO(rw, iocb, inode, iov, offset, nr_segs,
scoutfs_get_block_write_dio);
if (ret > 0 && (rw & WRITE))
{
ret = convert_unwritten_extent(inode, offset, ret);
}
return ret;
}
/*
* Try to allocate unwritten extents for any unallocated regions of the
* logical block extent from the caller. The caller manages locks and
@@ -1473,13 +1748,14 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
iblock = start >> SCOUTFS_BLOCK_SM_SHIFT;
last = (start + len - 1) >> SCOUTFS_BLOCK_SM_SHIFT;
while (iblock <= last) {
while (true) {
ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
iblock, 1, &ext);
if (ret < 0) {
if (ret == -ENOENT)
if (ret == -ENOENT) {
ret = 0;
last_flags = FIEMAP_EXTENT_LAST;
last_flags = FIEMAP_EXTENT_LAST;
}
break;
}
@@ -1804,6 +2080,7 @@ const struct address_space_operations scoutfs_file_aops = {
.writepages = scoutfs_writepages,
.write_begin = scoutfs_write_begin,
.write_end = scoutfs_write_end,
.direct_IO = scoutfs_direct_IO,
};
const struct file_operations scoutfs_file_fops = {

View File

@@ -49,6 +49,8 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len);
long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len);
int scoutfs_get_block_write(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create);
int scoutfs_data_init_offline_extent(struct inode *inode, u64 size,
struct scoutfs_lock *lock);
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,

View File

@@ -47,6 +47,9 @@ ssize_t scoutfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
DECLARE_DATA_WAIT(dw);
int ret;
if (!is_sync_kiocb(iocb))
return -EINVAL;
retry:
/* protect checked extents from release */
mutex_lock(&inode->i_mutex);
@@ -97,6 +100,9 @@ ssize_t scoutfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
DECLARE_DATA_WAIT(dw);
int ret;
if (!is_sync_kiocb(iocb))
return -EINVAL;
if (iocb->ki_left == 0) /* Does this even happen? */
return 0;

View File

@@ -19,6 +19,7 @@
#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/list_sort.h>
#include <linux/buffer_head.h>
#include "format.h"
#include "super.h"
@@ -354,15 +355,22 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
{
struct scoutfs_inode_info *si = SCOUTFS_I(inode);
struct super_block *sb = inode->i_sb;
SCOUTFS_DECLARE_PER_TASK_ENTRY(pt_ent);
LIST_HEAD(ind_locks);
int ret;
if (!S_ISREG(inode->i_mode))
return 0;
scoutfs_per_task_add(&si->pt_data_lock, &pt_ent, lock);
ret = block_truncate_page(inode->i_mapping, new_size, scoutfs_get_block_write);
scoutfs_per_task_del(&si->pt_data_lock, &pt_ent);
if (ret)
goto out;
ret = scoutfs_inode_index_lock_hold(inode, &ind_locks, true, false);
if (ret)
return ret;
goto out;
if (new_size != i_size_read(inode))
scoutfs_inode_inc_data_version(inode);
@@ -378,6 +386,7 @@ static int set_inode_size(struct inode *inode, struct scoutfs_lock *lock,
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
out:
return ret;
}

View File

@@ -609,6 +609,21 @@ out:
return ret;
}
/*
* The main quorum task maintains its private status. It seemed cleaner
* to occasionally copy the status for showing in sysfs/debugfs files
* than to have the two lock access to shared status. The show copy is
* updated after being modified before the quorum task sleeps for a
* significant amount of time, either waiting on timeouts or interacting
* with the server.
*/
static void update_show_status(struct quorum_info *qinf, struct quorum_status *qst)
{
spin_lock(&qinf->show_lock);
qinf->show_status = *qst;
spin_unlock(&qinf->show_lock);
}
/*
* The quorum work always runs in the background of quorum member
* mounts. It's responsible for starting and stopping the server if
@@ -651,6 +666,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
while (!(qinf->shutdown || scoutfs_forcing_unmount(sb))) {
update_show_status(qinf, &qst);
ret = recv_msg(sb, &msg, qst.timeout);
if (ret < 0) {
if (ret != -ETIMEDOUT && ret != -EAGAIN) {
@@ -681,10 +698,6 @@ static void scoutfs_quorum_worker(struct work_struct *work)
scoutfs_inc_counter(sb, quorum_send_resignation);
}
spin_lock(&qinf->show_lock);
qinf->show_status = qst;
spin_unlock(&qinf->show_lock);
trace_scoutfs_quorum_loop(sb, qst.role, qst.term, qst.vote_for,
qst.vote_bits,
ktime_to_timespec64(qst.timeout));
@@ -695,6 +708,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
if (qst.role == LEADER) {
scoutfs_warn(sb, "saw msg type %u from %u for term %llu while leader in term %llu, shutting down server.",
msg.type, msg.from, msg.term, qst.term);
update_show_status(qinf, &qst);
scoutfs_server_stop(sb);
}
qst.role = FOLLOWER;
@@ -758,6 +772,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
qst.term);
qst.timeout = heartbeat_interval();
update_show_status(qinf, &qst);
/* record that we've been elected before starting up server */
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
if (ret < 0)
@@ -817,6 +833,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
}
}
update_show_status(qinf, &qst);
/* always try to stop a running server as we stop */
if (test_bit(QINF_FLAG_SERVER, &qinf->flags)) {
scoutfs_server_stop(sb);

View File

@@ -75,6 +75,20 @@ t_fs_nrs()
seq 0 $((T_NR_MOUNTS - 1))
}
#
# outputs "1" if the fs number has "1" in its quorum/is_leader file.
# All other cases output 0, including the fs nr being a client which
# won't have a quorum/ dir.
#
t_fs_is_leader()
{
if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader 2>/dev/null)" == "1" ]; then
echo "1"
else
echo "0"
fi
}
#
# Output the mount nr of the current server. This takes no steps to
# ensure that the server doesn't shut down and have some other mount
@@ -83,7 +97,7 @@ t_fs_nrs()
t_server_nr()
{
for i in $(t_fs_nrs); do
if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "1" ]; then
if [ "$(t_fs_is_leader $i)" == "1" ]; then
echo $i
return
fi
@@ -101,7 +115,7 @@ t_server_nr()
t_first_client_nr()
{
for i in $(t_fs_nrs); do
if [ "$(cat $(t_sysfs_path $i)/quorum/is_leader)" == "0" ]; then
if [ "$(t_fs_is_leader $i)" == "0" ]; then
echo $i
return
fi

View File

@@ -45,9 +45,14 @@ generic/107
generic/117
generic/124
generic/129
generic/130
generic/131
generic/135
generic/169
generic/184
generic/211
generic/212
generic/214
generic/221
generic/228
generic/236
@@ -100,13 +105,9 @@ generic/078
generic/079
generic/081
generic/082
generic/091
generic/094
generic/096
generic/110
generic/111
generic/113
generic/114
generic/115
generic/116
generic/118
@@ -115,9 +116,7 @@ generic/121
generic/122
generic/123
generic/128
generic/130
generic/134
generic/135
generic/136
generic/138
generic/139
@@ -165,7 +164,6 @@ generic/194
generic/195
generic/196
generic/197
generic/198
generic/199
generic/200
generic/201
@@ -173,11 +171,6 @@ generic/202
generic/203
generic/205
generic/206
generic/207
generic/210
generic/211
generic/212
generic/214
generic/216
generic/217
generic/218
@@ -185,13 +178,11 @@ generic/219
generic/220
generic/222
generic/223
generic/225
generic/227
generic/229
generic/230
generic/235
generic/238
generic/240
generic/244
generic/250
generic/252
@@ -203,7 +194,6 @@ generic/259
generic/260
generic/261
generic/262
generic/263
generic/264
generic/265
generic/266
@@ -282,4 +272,4 @@ shared/004
shared/032
shared/051
shared/289
Passed all 75 tests
Passed all 80 tests

View File

@@ -64,19 +64,29 @@ generic/029 # mmap missing
generic/030 # mmap missing
generic/075 # file content mismatch failures (fds, etc)
generic/080 # mmap missing
generic/091 # skip fsx tests
generic/094 # odirect streaming pre-alloc treated as failure in xfstests
generic/103 # enospc causes trans commit failures
generic/105 # needs triage: something about acls
generic/108 # mount fails on failing device?
generic/112 # file content mismatch failures (fds, etc)
generic/113 # block aio dio runs
generic/114 # block aio dio runs
generic/120 # (can't exec 'cause no mmap)
generic/126 # (can't exec 'cause no mmap)
generic/141 # mmap missing
generic/198 # block aio dio runs
generic/207 # block aio dio runs
generic/210 # block aio dio runs
generic/213 # enospc causes trans commit failures
generic/215 # mmap missing
generic/225 # odirect streaming pre-alloc treated as failure in xfstests
generic/237 # wrong error return from failing setfacl?
generic/240 # block aio dio runs
generic/246 # mmap missing
generic/247 # mmap missing
generic/248 # mmap missing
generic/263 # do not support allocate mode FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE, FALLOC_FL_ZERO_RANGE...
generic/319 # utils output change? update branch?
generic/321 # requires selinux enabled for '+' in ls?
generic/325 # mmap missing