mirror of https://github.com/versity/scoutfs.git
synced 2026-01-09 13:23:14 +00:00

Compare commits: zab/worm...zab/v1_5_r
7 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 1abe97351d | |
| | f757e29915 | |
| | 31e474c5fa | |
| | dcf8202d7c | |
| | ae55fa3153 | |
| | 7f9f21317c | |
| | 0d4bf83da3 | |
```diff
@@ -1,6 +1,25 @@
 Versity ScoutFS Release Notes
 =============================
 
+---
+v1.5
+\
+*Jun 21, 2022*
+
+* **Fix persistent error during server startup**
+\
+Fixed a case where the server would always hit a persistent error on
+startup, preventing the system from mounting.  This required a rare
+but valid state across the clients.
+
+* **Fix a client hang that would lead to fencing**
+\
+The client module's use of in-kernel networking was missing an annotation
+that could lead to communication hanging.  The server would fence the
+client when it stopped communicating.  This could be identified by the
+server fencing a client after it disconnected with no attempt by the
+client to reconnect.
+
 ---
 v1.4
 \
```
```diff
@@ -84,6 +84,21 @@ static u64 smallest_order_length(u64 len)
 	return 1ULL << (free_extent_order(len) * 3);
 }
 
+/*
+ * An extent modification dirties three distinct leaves of an allocator
+ * btree as it adds and removes the blkno and size sorted items for the
+ * old and new lengths of the extent.  Dirtying the paths to these
+ * leaves can grow the tree and grow/shrink neighbours at each level.
+ * We over-estimate the number of blocks allocated and freed (the paths
+ * share a root, growth doesn't free) to err on the simpler and safer
+ * side.  The overhead is minimal given the relatively large list blocks
+ * and relatively short allocator trees.
+ */
+static u32 extent_mod_blocks(u32 height)
+{
+	return ((1 + height) * 2) * 3;
+}
+
 /*
  * Free extents don't have flags and are stored in two indexes sorted by
  * block location and by length order, largest first.  The location key
```
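The new `extent_mod_blocks()` helper packages the worst-case estimate described in its comment. As a quick sanity check of the arithmetic, here is a standalone sketch (plain userspace C, not kernel code) evaluating the same formula: three dirtied leaves, each with a path of `1 + height` blocks that can both allocate and free at every level.

```c
#include <stdio.h>

/* Same formula as extent_mod_blocks() in the hunk above: 3 dirtied
 * leaves, each path is (1 + height) blocks since growth can add a
 * level, and each dirtied block may both allocate a new block and
 * free the old one, hence the factor of 2. */
static unsigned int extent_mod_blocks(unsigned int height)
{
	return ((1 + height) * 2) * 3;
}

int main(void)
{
	unsigned int h;

	for (h = 1; h <= 4; h++)
		printf("height %u -> %u blocks\n", h, extent_mod_blocks(h));
	/* height 3 -> 24, the reserve for one extent mod in a 3-level tree */
	return 0;
}
```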
```diff
@@ -877,6 +892,14 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r
  * -ENOENT is returned if we run out of extents in the source tree
  * before moving the total.
  *
+ * If meta_reserved is non-zero then -EINPROGRESS can be returned if the
+ * current meta allocator's avail blocks or room for freed blocks would
+ * have fallen under the reserved amount.  The trees could have been
+ * successfully dirtied in this case but the number of blocks moved is
+ * not returned.  The caller is expected to deal with the partial
+ * progress by committing the dirty trees and examining the resulting
+ * modified trees to see if they need to continue moving extents.
+ *
  * The caller can specify that extents in the source tree should first
  * be found based on their zone bitmaps.  We'll first try to find
  * extents in the exclusive zones, then vacant zones, and then we'll
@@ -891,7 +914,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
 		       struct scoutfs_alloc_root *dst,
 		       struct scoutfs_alloc_root *src, u64 total,
-		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks)
+		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved)
 {
 	struct alloc_ext_args args = {
 		.alloc = alloc,
@@ -941,6 +964,14 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		if (ret < 0)
 			break;
 
+		if (meta_reserved != 0 &&
+		    scoutfs_alloc_meta_low(sb, alloc, meta_reserved +
+					   extent_mod_blocks(src->root.height) +
+					   extent_mod_blocks(dst->root.height))) {
+			ret = -EINPROGRESS;
+			break;
+		}
+
 		/* searching set start/len, finish initializing alloced extent */
 		ext.map = found.map ? ext.start - found.start + found.map : 0;
 		ext.flags = found.flags;
```
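This is the check that implements the `meta_reserved` contract documented earlier: before dirtying each extent, the loop verifies that the meta allocator would keep `meta_reserved` blocks of headroom even after worst-case dirtying of both the source and destination trees, and bails out with `-EINPROGRESS` otherwise. A hedged illustration of the arithmetic (the heights and reserve are made-up values, not taken from the diff):

```c
#include <stdio.h>

static unsigned int extent_mod_blocks(unsigned int height)
{
	return ((1 + height) * 2) * 3;
}

int main(void)
{
	/* illustrative values only */
	unsigned int meta_reserved = 100;
	unsigned int src_height = 2, dst_height = 3;

	/* the move stops with -EINPROGRESS once fewer avail blocks
	 * (or freed-list slots) than this would remain */
	printf("threshold: %u blocks\n",
	       meta_reserved + extent_mod_blocks(src_height) +
	       extent_mod_blocks(dst_height)); /* 100 + 18 + 24 = 142 */
	return 0;
}
```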
```diff
@@ -1065,15 +1096,6 @@ out:
  * than completely exhausting the avail list or overflowing the freed
  * list.
  *
- * An extent modification dirties three distinct leaves of an allocator
- * btree as it adds and removes the blkno and size sorted items for the
- * old and new lengths of the extent.  Dirtying the paths to these
- * leaves can grow the tree and grow/shrink neighbours at each level.
- * We over-estimate the number of blocks allocated and freed (the paths
- * share a root, growth doesn't free) to err on the simpler and safer
- * side.  The overhead is minimal given the relatively large list blocks
- * and relatively short allocator trees.
- *
  * The caller tells us how many extents they're about to modify and how
  * many other additional blocks they may cow manually.  And finally, the
  * caller could be the first to dirty the avail and freed blocks in the
@@ -1082,7 +1104,7 @@ out:
 static bool list_has_blocks(struct super_block *sb, struct scoutfs_alloc *alloc,
 			    struct scoutfs_alloc_root *root, u32 extents, u32 addl_blocks)
 {
-	u32 tree_blocks = (((1 + root->root.height) * 2) * 3) * extents;
+	u32 tree_blocks = extent_mod_blocks(root->root.height) * extents;
 	u32 most = 1 + tree_blocks + addl_blocks;
 
 	if (le32_to_cpu(alloc->avail.first_nr) < most) {
```
```diff
@@ -131,7 +131,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
 		       struct scoutfs_alloc_root *dst,
 		       struct scoutfs_alloc_root *src, u64 total,
-		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks);
+		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved);
 int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
 			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
 			 u64 start, u64 len);
```
```diff
@@ -991,6 +991,8 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
 		if (ret < 0)
 			break;
 
+		acc_sock->sk->sk_allocation = GFP_NOFS;
+
 		/* inherit accepted request funcs from listening conn */
 		acc_conn = scoutfs_net_alloc_conn(sb, conn->notify_up,
 						  conn->notify_down,
@@ -1053,6 +1055,8 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
 	if (ret)
 		goto out;
 
+	sock->sk->sk_allocation = GFP_NOFS;
+
 	/* caller specified connect timeout */
 	tv.tv_sec = conn->connect_timeout_ms / MSEC_PER_SEC;
 	tv.tv_usec = (conn->connect_timeout_ms % MSEC_PER_SEC) * USEC_PER_MSEC;
@@ -1450,6 +1454,8 @@ int scoutfs_net_bind(struct super_block *sb,
 	if (ret)
 		goto out;
 
+	sock->sk->sk_allocation = GFP_NOFS;
+
 	optval = 1;
 	ret = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
 				(char *)&optval, sizeof(optval));
```
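All three networking hunks apply the same one-line fix: every socket the module creates or accepts has its allocation mask switched from the default `GFP_KERNEL` to `GFP_NOFS`, so socket buffer allocations made under memory pressure cannot recurse into filesystem reclaim and deadlock against the mount doing the networking. This is presumably the "missing annotation" the v1.5 release note refers to. A minimal sketch of the pattern; `sock_create_kern()` is shown in its modern form and has taken different arguments across kernel versions:

```c
#include <linux/gfp.h>
#include <linux/net.h>
#include <net/net_namespace.h>
#include <net/sock.h>

/* Sketch: create a kernel TCP socket and mark it GFP_NOFS before any
 * traffic, mirroring the three hunks above. */
static int create_nofs_socket(struct socket **sockp)
{
	struct socket *sock;
	int ret;

	ret = sock_create_kern(&init_net, PF_INET, SOCK_STREAM,
			       IPPROTO_TCP, &sock);
	if (ret < 0)
		return ret;

	/* allocations for this socket must not enter fs reclaim */
	sock->sk->sk_allocation = GFP_NOFS;

	*sockp = sock;
	return 0;
}
```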
```diff
@@ -689,23 +689,18 @@ static int alloc_move_refill_zoned(struct super_block *sb, struct scoutfs_alloc_
 	return scoutfs_alloc_move(sb, &server->alloc, &server->wri, dst, src,
 				  min(target - le64_to_cpu(dst->total_len),
 				      le64_to_cpu(src->total_len)),
-				  exclusive, vacant, zone_blocks);
-}
-
-static inline int alloc_move_refill(struct super_block *sb, struct scoutfs_alloc_root *dst,
-				    struct scoutfs_alloc_root *src, u64 lo, u64 target)
-{
-	return alloc_move_refill_zoned(sb, dst, src, lo, target, NULL, NULL, 0);
+				  exclusive, vacant, zone_blocks, 0);
 }
 
 static int alloc_move_empty(struct super_block *sb,
 			    struct scoutfs_alloc_root *dst,
-			    struct scoutfs_alloc_root *src)
+			    struct scoutfs_alloc_root *src, u64 meta_reserved)
 {
 	DECLARE_SERVER_INFO(sb, server);
 
 	return scoutfs_alloc_move(sb, &server->alloc, &server->wri,
-				  dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0);
+				  dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0,
+				  meta_reserved);
 }
 
 /*
@@ -1272,6 +1267,7 @@ static int server_get_log_trees(struct super_block *sb,
 	char *err_str = NULL;
 	u64 nr;
 	int ret;
+	int err;
 
 	if (arg_len != 0) {
 		ret = -EINVAL;
@@ -1315,16 +1311,27 @@ static int server_get_log_trees(struct super_block *sb,
 		goto unlock;
 	}
 
+	if (ret != -ENOENT) {
+		/* need to sync lt with respect to changes in other structures */
+		scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid), le64_to_cpu(lt.nr));
+		ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
+					  &super->logs_root, &key);
+		if (ret < 0) {
+			err_str = "dirtying lt btree key";
+			goto unlock;
+		}
+	}
+
 	/* drops and re-acquires the mutex and commit if it has to wait */
 	ret = finalize_and_start_log_merge(sb, &lt, rid, &hold);
 	if (ret < 0)
-		goto unlock;
+		goto update;
 
 	if (get_volopt_val(server, SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR, &data_zone_blocks)) {
 		ret = get_data_alloc_zone_bits(sb, rid, exclusive, vacant, data_zone_blocks);
 		if (ret < 0) {
 			err_str = "getting alloc zone bits";
-			goto unlock;
+			goto update;
 		}
 	} else {
 		data_zone_blocks = 0;
@@ -1341,13 +1348,13 @@ static int server_get_log_trees(struct super_block *sb,
 					       &lt.meta_freed);
 	if (ret < 0) {
 		err_str = "splicing committed meta_freed";
-		goto unlock;
+		goto update;
 	}
 
-	ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed);
+	ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 0);
 	if (ret < 0) {
 		err_str = "emptying committed data_freed";
-		goto unlock;
+		goto update;
 	}
 
 	ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
@@ -1356,7 +1363,7 @@ static int server_get_log_trees(struct super_block *sb,
 				      SCOUTFS_SERVER_META_FILL_TARGET);
 	if (ret < 0) {
 		err_str = "filling meta_avail";
-		goto unlock;
+		goto update;
 	}
 
 	if (le64_to_cpu(server->meta_avail->total_len) <= scoutfs_server_reserved_meta_blocks(sb))
@@ -1369,7 +1376,7 @@ static int server_get_log_trees(struct super_block *sb,
 					exclusive, vacant, data_zone_blocks);
 	if (ret < 0) {
 		err_str = "refilling data_avail";
-		goto unlock;
+		goto update;
 	}
 
 	if (le64_to_cpu(lt.data_avail.total_len) < SCOUTFS_SERVER_DATA_FILL_LO)
@@ -1389,7 +1396,7 @@ static int server_get_log_trees(struct super_block *sb,
 	if (ret < 0) {
 		zero_data_alloc_zone_bits(&lt);
 		err_str = "setting data_avail zone bits";
-		goto unlock;
+		goto update;
 	}
 
 	lt.data_alloc_zone_blocks = cpu_to_le64(data_zone_blocks);
@@ -1398,13 +1405,18 @@ static int server_get_log_trees(struct super_block *sb,
 	/* give the transaction a new seq (must have been ==) */
 	lt.get_trans_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
 
+update:
 	/* update client's log tree's item */
-	scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid),
-				   le64_to_cpu(lt.nr));
-	ret = scoutfs_btree_force(sb, &server->alloc, &server->wri,
+	scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid), le64_to_cpu(lt.nr));
+	err = scoutfs_btree_force(sb, &server->alloc, &server->wri,
 				  &super->logs_root, &key, &lt, sizeof(lt));
-	if (ret < 0)
-		err_str = "updating log trees";
+	BUG_ON(err < 0); /* can duplicate extents.. move dst in super, still in lt src */
+	if (err < 0) {
+		if (ret == 0) {
+			ret = err;
+			err_str = "updating log trees";
+		}
+	}
 
 unlock:
 	if (unlock_alloc)
```
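The retargeted gotos above change the function's failure contract: once the allocator trees may have been modified, every error path falls through the new `update:` label so the client's log_trees item is always rewritten to match, and the first error to occur is the one reported. A small userspace sketch of that "first error wins, the update always runs" shape (the step names are stand-ins, not scoutfs calls):

```c
#include <stdio.h>

/* Stand-ins for the btree steps in the hunks above. */
static int risky_step(void)    { return -5; }	/* fails, say -EIO */
static int later_step(void)    { return 0; }	/* would have run next */
static int forced_update(void) { return 0; }	/* expected to succeed */

int main(void)
{
	const char *err_str = NULL;
	int ret, err;

	ret = risky_step();
	if (ret < 0) {
		err_str = "risky step";
		goto update;	/* skip later steps; trees may be dirty */
	}

	ret = later_step();
	if (ret < 0)
		err_str = "later step";

update:
	/* always rewrite the item; report its error only if it's first */
	err = forced_update();
	if (err < 0 && ret == 0) {
		ret = err;
		err_str = "updating log trees";
	}

	printf("ret %d (%s)\n", ret, err_str ? err_str : "ok");
	return 0;
}
```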
```diff
@@ -1544,9 +1556,11 @@ static int server_get_roots(struct super_block *sb,
  * read and we finalize the tree so that it will be merged.  We reclaim
  * all the allocator items.
  *
- * The caller holds the commit rwsem which means we do all this work in
- * one server commit.  We'll need to keep the total amount of blocks in
- * trees in check.
+ * The caller holds the commit rwsem which means we have to do our work
+ * in one commit.  The allocator btrees can be very large and very
+ * fragmented.  We return -EINPROGRESS if we couldn't fully reclaim the
+ * allocators in one commit.  The caller should apply the current
+ * commit and call again in a new commit.
  *
  * By the time we're evicting a client they've either synced their data
  * or have been forcefully removed.  The free blocks in the allocator
@@ -1606,9 +1620,9 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 	}
 
 	/*
-	 * All of these can return errors after having modified the
-	 * allocator trees.  We have to try and update the roots in the
-	 * log item.
+	 * All of these can return errors, perhaps indicating successful
+	 * partial progress, after having modified the allocator trees.
+	 * We always have to update the roots in the log item.
 	 */
 	mutex_lock(&server->alloc_mutex);
 	ret = (err_str = "splice meta_freed to other_freed",
@@ -1618,18 +1632,21 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 	       scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, server->other_freed,
 					 &lt.meta_avail)) ?:
 	      (err_str = "empty data_avail",
-	       alloc_move_empty(sb, &super->data_alloc, &lt.data_avail)) ?:
+	       alloc_move_empty(sb, &super->data_alloc, &lt.data_avail, 100)) ?:
 	      (err_str = "empty data_freed",
-	       alloc_move_empty(sb, &super->data_alloc, &lt.data_freed));
+	       alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100));
 	mutex_unlock(&server->alloc_mutex);
 
-	/* the transaction is no longer open */
-	lt.commit_trans_seq = lt.get_trans_seq;
+	/* only finalize, allowing merging, once the allocators are fully freed */
+	if (ret == 0) {
+		/* the transaction is no longer open */
+		lt.commit_trans_seq = lt.get_trans_seq;
 
-	/* the mount is no longer writing to the zones */
-	zero_data_alloc_zone_bits(&lt);
-	le64_add_cpu(&lt.flags, SCOUTFS_LOG_TREES_FINALIZED);
-	lt.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
+		/* the mount is no longer writing to the zones */
+		zero_data_alloc_zone_bits(&lt);
+		le64_add_cpu(&lt.flags, SCOUTFS_LOG_TREES_FINALIZED);
+		lt.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
+	}
 
 	err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
 				   &super->logs_root, &key, &lt, sizeof(lt));
@@ -1638,7 +1655,7 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 out:
 	mutex_unlock(&server->logs_mutex);
 
-	if (ret < 0)
+	if (ret < 0 && ret != -EINPROGRESS)
 		scoutfs_err(sb, "server error %d reclaiming log trees for rid %016llx: %s",
 			    ret, rid, err_str);
 
@@ -3536,26 +3553,37 @@ struct farewell_request {
  * Reclaim all the resources for a mount which has gone away.  It's sent
  * us a farewell promising to leave or we actively fenced it.
  *
- * It's safe to call this multiple times for a given rid.  Each
- * individual action knows to recognize that it's already been performed
- * and return success.
+ * This can be called multiple times across different servers for
+ * different reclaim attempts.  The existence of the mounted_client item
+ * triggers reclaim and must be deleted last.  Each step knows that it
+ * can be called multiple times and safely recognizes that its work
+ * might have already been done.
+ *
+ * Some steps (reclaiming large fragmented allocators) may need multiple
+ * calls to complete.  They return -EINPROGRESS which tells us to apply
+ * the server commit and retry.
 */
static int reclaim_rid(struct super_block *sb, u64 rid)
{
	COMMIT_HOLD(hold);
	int ret;
+	int err;

-	server_hold_commit(sb, &hold);
+	do {
+		server_hold_commit(sb, &hold);

-	/* delete mounted client last, recovery looks for it */
-	ret = scoutfs_lock_server_farewell(sb, rid) ?:
-	      reclaim_open_log_tree(sb, rid) ?:
-	      cancel_srch_compact(sb, rid) ?:
-	      cancel_log_merge(sb, rid) ?:
-	      scoutfs_omap_remove_rid(sb, rid) ?:
-	      delete_mounted_client(sb, rid);
+		/* delete mounted client last, recovery looks for it */
+		err = scoutfs_lock_server_farewell(sb, rid) ?:
+		      reclaim_open_log_tree(sb, rid) ?:
+		      cancel_srch_compact(sb, rid) ?:
+		      cancel_log_merge(sb, rid) ?:
+		      scoutfs_omap_remove_rid(sb, rid) ?:
+		      delete_mounted_client(sb, rid);

-	return server_apply_commit(sb, &hold, ret);
+		ret = server_apply_commit(sb, &hold, err == -EINPROGRESS ? 0 : err);
+
+	} while (err == -EINPROGRESS && ret == 0);
+
+	return ret;
 }
 
 /*
```
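Read together with `reclaim_open_log_tree()` above, the loop's contract is: a step that returns `-EINPROGRESS` still gets its partial progress committed (the error is masked to 0 for `server_apply_commit()`), and the whole chain simply runs again in a fresh commit until it finishes. A compact userspace trace of that control flow (the step and commit functions are stand-ins, not the scoutfs API):

```c
#include <errno.h>
#include <stdio.h>

static int passes = 3;

/* Stand-in for the reclaim chain: needs three commits to finish. */
static int reclaim_steps(void) { return --passes > 0 ? -EINPROGRESS : 0; }

/* Stand-in for server_apply_commit(): returns the error it was given. */
static int apply_commit(int err) { return err; }

int main(void)
{
	int ret, err;

	do {
		err = reclaim_steps();
		/* partial progress still commits: mask -EINPROGRESS */
		ret = apply_commit(err == -EINPROGRESS ? 0 : err);
		printf("pass: err %d, commit ret %d\n", err, ret);
	} while (err == -EINPROGRESS && ret == 0);

	return ret;	/* 0 after three passes */
}
```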