Mirror of https://github.com/versity/scoutfs.git, synced 2026-01-10 05:37:25 +00:00

Compare commits: zab/worm ... zab/cluste (9 Commits)
| Author | SHA1 | Date |
|---|---|---|
|  | a1f917d5bb |  |
|  | 68aa9e5ce6 |  |
|  | 28079610ed |  |
|  | 05cd4ebe92 |  |
|  | 0041b599ab |  |
|  | f30e545870 |  |
|  | cf046f26c4 |  |
|  | 7f9f21317c |  |
|  | 0d4bf83da3 |  |
@@ -84,6 +84,21 @@ static u64 smallest_order_length(u64 len)
 	return 1ULL << (free_extent_order(len) * 3);
 }
 
+/*
+ * An extent modification dirties three distinct leaves of an allocator
+ * btree as it adds and removes the blkno and size sorted items for the
+ * old and new lengths of the extent.  Dirtying the paths to these
+ * leaves can grow the tree and grow/shrink neighbours at each level.
+ * We over-estimate the number of blocks allocated and freed (the paths
+ * share a root, growth doesn't free) to err on the simpler and safer
+ * side.  The overhead is minimal given the relatively large list blocks
+ * and relatively short allocator trees.
+ */
+static u32 extent_mod_blocks(u32 height)
+{
+	return ((1 + height) * 2) * 3;
+}
+
 /*
  * Free extents don't have flags and are stored in two indexes sorted by
  * block location and by length order, largest first.  The location key
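A quick back-of-the-envelope check of the estimate above (editorial illustration, not part of the diff): each modification touches three leaves, each leaf path is 1 + height blocks, and each block is counted once as allocated and once as freed. A tiny userspace C program makes the numbers concrete:

#include <stdio.h>
#include <stdint.h>

/* mirrors the helper added in the hunk above */
static uint32_t extent_mod_blocks(uint32_t height)
{
	/* 3 leaves * (1 + height) blocks per path * 2 (allocated and freed) */
	return ((1 + height) * 2) * 3;
}

int main(void)
{
	uint32_t height;

	for (height = 0; height <= 3; height++)
		printf("height %u -> %u blocks\n", height, extent_mod_blocks(height));

	return 0;
}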
@@ -877,6 +892,14 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r
  * -ENOENT is returned if we run out of extents in the source tree
  * before moving the total.
  *
+ * If meta_reserved is non-zero then -EINPROGRESS can be returned if the
+ * current meta allocator's avail blocks or room for freed blocks would
+ * have fallen under the reserved amount.  The trees could have been
+ * successfully dirtied in this case but the number of blocks moved is
+ * not returned.  The caller is expected to deal with the partial
+ * progress by committing the dirty trees and examining the resulting
+ * modified trees to see if they need to continue moving extents.
+ *
  * The caller can specify that extents in the source tree should first
  * be found based on their zone bitmaps.  We'll first try to find
  * extents in the exclusive zones, then vacant zones, and then we'll
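To make the caller contract above concrete, here is a minimal userspace sketch of the commit-and-retry loop it describes (illustrative only; move_some_extents() and apply_commit() are hypothetical stand-ins for scoutfs_alloc_move() and the server's commit path):

#include <errno.h>
#include <stdio.h>

static int passes_left = 3;	/* pretend the source tree empties in three passes */

static int move_some_extents(void)
{
	if (--passes_left > 0)
		return -EINPROGRESS;	/* partial progress: commit and call again */
	return 0;
}

static int apply_commit(int err)
{
	return err;			/* pretend the commit itself always succeeds */
}

int main(void)
{
	int err;
	int ret;

	do {
		err = move_some_extents();
		/* -EINPROGRESS is partial progress, not failure, for the commit */
		ret = apply_commit(err == -EINPROGRESS ? 0 : err);
	} while (err == -EINPROGRESS && ret == 0);

	printf("done: ret %d\n", ret);
	return ret ? 1 : 0;
}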
@@ -891,7 +914,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		       struct scoutfs_block_writer *wri,
 		       struct scoutfs_alloc_root *dst,
 		       struct scoutfs_alloc_root *src, u64 total,
-		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks)
+		       __le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved)
 {
 	struct alloc_ext_args args = {
 		.alloc = alloc,
@@ -941,6 +964,14 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 		if (ret < 0)
 			break;
 
+		if (meta_reserved != 0 &&
+		    scoutfs_alloc_meta_low(sb, alloc, meta_reserved +
+					   extent_mod_blocks(src->root.height) +
+					   extent_mod_blocks(dst->root.height))) {
+			ret = -EINPROGRESS;
+			break;
+		}
+
 		/* searching set start/len, finish initializing alloced extent */
 		ext.map = found.map ? ext.start - found.start + found.map : 0;
 		ext.flags = found.flags;
@@ -1065,15 +1096,6 @@ out:
  * than completely exhausting the avail list or overflowing the freed
  * list.
  *
- * An extent modification dirties three distinct leaves of an allocator
- * btree as it adds and removes the blkno and size sorted items for the
- * old and new lengths of the extent.  Dirtying the paths to these
- * leaves can grow the tree and grow/shrink neighbours at each level.
- * We over-estimate the number of blocks allocated and freed (the paths
- * share a root, growth doesn't free) to err on the simpler and safer
- * side.  The overhead is minimal given the relatively large list blocks
- * and relatively short allocator trees.
- *
  * The caller tells us how many extents they're about to modify and how
  * many other additional blocks they may cow manually.  And finally, the
  * caller could be the first to dirty the avail and freed blocks in the
@@ -1082,7 +1104,7 @@ out:
 static bool list_has_blocks(struct super_block *sb, struct scoutfs_alloc *alloc,
 			    struct scoutfs_alloc_root *root, u32 extents, u32 addl_blocks)
 {
-	u32 tree_blocks = (((1 + root->root.height) * 2) * 3) * extents;
+	u32 tree_blocks = extent_mod_blocks(root->root.height) * extents;
 	u32 most = 1 + tree_blocks + addl_blocks;
 
 	if (le32_to_cpu(alloc->avail.first_nr) < most) {
@@ -131,7 +131,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
 			struct scoutfs_block_writer *wri,
 			struct scoutfs_alloc_root *dst,
 			struct scoutfs_alloc_root *src, u64 total,
-			__le64 *exclusive, __le64 *vacant, u64 zone_blocks);
+			__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved);
 int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
 			 struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
 			 u64 start, u64 len);
kmod/src/lock.c — 765 changed lines
File diff suppressed because it is too large
@@ -1,6 +1,8 @@
 #ifndef _SCOUTFS_LOCK_H_
 #define _SCOUTFS_LOCK_H_
 
+#include <linux/rhashtable.h>
+
 #include "key.h"
 #include "tseq.h"
 
@@ -19,20 +21,24 @@ struct inode_deletion_lock_data;
  */
 struct scoutfs_lock {
 	struct super_block *sb;
 	atomic_t refcount;
 	spinlock_t lock;
+	struct rcu_head rcu_head;
 	struct scoutfs_key start;
 	struct scoutfs_key end;
-	struct rb_node node;
+	struct rhash_head ht_head;
+	struct rb_node range_node;
 	u64 refresh_gen;
 	u64 write_seq;
 	u64 dirty_trans_seq;
 	struct list_head lru_head;
 	int lru_on_list;
 	wait_queue_head_t waitq;
 	unsigned long request_pending:1,
 		      invalidate_pending:1;
 
 	struct list_head inv_head;	/* entry in linfo's list of locks with invalidations */
-	struct list_head inv_list;	/* list of lock's invalidation requests */
+	struct list_head inv_req_list;	/* list of lock's invalidation requests */
 	struct list_head shrink_head;
 
 	spinlock_t cov_list_lock;
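The struct now carries rhashtable linkage (ht_head), a separate rb_node for range tracking, and an rcu_head for deferred freeing; the lock.c diff that uses them is suppressed above. A minimal sketch of how such a table could be keyed and probed, assuming a table keyed on the lock's start key — the parameter values and helper name here are assumptions, not scoutfs's actual implementation:

/* sketch only: assumed rhashtable setup for struct scoutfs_lock */
static const struct rhashtable_params lock_ht_params = {
	.key_len             = sizeof(struct scoutfs_key),
	.key_offset          = offsetof(struct scoutfs_lock, start),
	.head_offset         = offsetof(struct scoutfs_lock, ht_head),
	.automatic_shrinking = true,
};

/* caller holds rcu_read_lock(); rcu_head in the struct lets frees wait for readers */
static struct scoutfs_lock *lock_lookup(struct rhashtable *ht,
					struct scoutfs_key *start)
{
	return rhashtable_lookup_fast(ht, start, lock_ht_params);
}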
@@ -1023,6 +1023,7 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		__field(unsigned char, request_pending)
 		__field(unsigned char, invalidate_pending)
 		__field(int, mode)
+		__field(unsigned int, refcount)
 		__field(unsigned int, waiters_cw)
 		__field(unsigned int, waiters_pr)
 		__field(unsigned int, waiters_ex)
@@ -1038,6 +1039,7 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		__entry->request_pending = lck->request_pending;
 		__entry->invalidate_pending = lck->invalidate_pending;
 		__entry->mode = lck->mode;
+		__entry->refcount = atomic_read(&lck->refcount);
 		__entry->waiters_pr = lck->waiters[SCOUTFS_LOCK_READ];
 		__entry->waiters_ex = lck->waiters[SCOUTFS_LOCK_WRITE];
 		__entry->waiters_cw = lck->waiters[SCOUTFS_LOCK_WRITE_ONLY];
@@ -1045,10 +1047,10 @@ DECLARE_EVENT_CLASS(scoutfs_lock_class,
 		__entry->users_ex = lck->users[SCOUTFS_LOCK_WRITE];
 		__entry->users_cw = lck->users[SCOUTFS_LOCK_WRITE_ONLY];
 	),
-	TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u reqpnd %u invpnd %u rfrgen %llu waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
+	TP_printk(SCSBF" start "SK_FMT" end "SK_FMT" mode %u reqpnd %u invpnd %u rfrgen %llu rfcnt %d waiters: pr %u ex %u cw %u users: pr %u ex %u cw %u",
 		  SCSB_TRACE_ARGS, sk_trace_args(start), sk_trace_args(end),
 		  __entry->mode, __entry->request_pending,
-		  __entry->invalidate_pending, __entry->refresh_gen,
+		  __entry->invalidate_pending, __entry->refresh_gen, __entry->refcount,
 		  __entry->waiters_pr, __entry->waiters_ex, __entry->waiters_cw,
 		  __entry->users_pr, __entry->users_ex, __entry->users_cw)
 );
@@ -689,23 +689,18 @@ static int alloc_move_refill_zoned(struct super_block *sb, struct scoutfs_alloc_
 	return scoutfs_alloc_move(sb, &server->alloc, &server->wri, dst, src,
 				  min(target - le64_to_cpu(dst->total_len),
 				      le64_to_cpu(src->total_len)),
-				  exclusive, vacant, zone_blocks);
-}
-
-static inline int alloc_move_refill(struct super_block *sb, struct scoutfs_alloc_root *dst,
-				    struct scoutfs_alloc_root *src, u64 lo, u64 target)
-{
-	return alloc_move_refill_zoned(sb, dst, src, lo, target, NULL, NULL, 0);
+				  exclusive, vacant, zone_blocks, 0);
 }
 
 static int alloc_move_empty(struct super_block *sb,
 			    struct scoutfs_alloc_root *dst,
-			    struct scoutfs_alloc_root *src)
+			    struct scoutfs_alloc_root *src, u64 meta_reserved)
 {
 	DECLARE_SERVER_INFO(sb, server);
 
 	return scoutfs_alloc_move(sb, &server->alloc, &server->wri,
-				  dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0);
+				  dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0,
+				  meta_reserved);
 }
 
 /*
@@ -1344,7 +1339,7 @@ static int server_get_log_trees(struct super_block *sb,
 		goto unlock;
 	}
 
-	ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed);
+	ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 0);
 	if (ret < 0) {
 		err_str = "emptying committed data_freed";
 		goto unlock;
@@ -1544,9 +1539,11 @@ static int server_get_roots(struct super_block *sb,
  * read and we finalize the tree so that it will be merged.  We reclaim
  * all the allocator items.
  *
- * The caller holds the commit rwsem which means we do all this work in
- * one server commit.  We'll need to keep the total amount of blocks in
- * trees in check.
+ * The caller holds the commit rwsem which means we have to do our work
+ * in one commit.  The allocator btrees can be very large and very
+ * fragmented.  We return -EINPROGRESS if we couldn't fully reclaim the
+ * allocators in one commit.  The caller should apply the current
+ * commit and call again in a new commit.
  *
  * By the time we're evicting a client they've either synced their data
  * or have been forcefully removed.  The free blocks in the allocator
@@ -1606,9 +1603,9 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 	}
 
 	/*
-	 * All of these can return errors after having modified the
-	 * allocator trees.  We have to try and update the roots in the
-	 * log item.
+	 * All of these can return errors, perhaps indicating successful
+	 * partial progress, after having modified the allocator trees.
+	 * We always have to update the roots in the log item.
 	 */
 	mutex_lock(&server->alloc_mutex);
 	ret = (err_str = "splice meta_freed to other_freed",
@@ -1618,18 +1615,21 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 	       scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, server->other_freed,
 					 &lt.meta_avail)) ?:
 	      (err_str = "empty data_avail",
-	       alloc_move_empty(sb, &super->data_alloc, &lt.data_avail)) ?:
+	       alloc_move_empty(sb, &super->data_alloc, &lt.data_avail, 100)) ?:
 	      (err_str = "empty data_freed",
-	       alloc_move_empty(sb, &super->data_alloc, &lt.data_freed));
+	       alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100));
 	mutex_unlock(&server->alloc_mutex);
 
-	/* the transaction is no longer open */
-	lt.commit_trans_seq = lt.get_trans_seq;
+	/* only finalize, allowing merging, once the allocators are fully freed */
+	if (ret == 0) {
+		/* the transaction is no longer open */
+		lt.commit_trans_seq = lt.get_trans_seq;
 
-	/* the mount is no longer writing to the zones */
-	zero_data_alloc_zone_bits(&lt);
-	le64_add_cpu(&lt.flags, SCOUTFS_LOG_TREES_FINALIZED);
-	lt.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
+		/* the mount is no longer writing to the zones */
+		zero_data_alloc_zone_bits(&lt);
+		le64_add_cpu(&lt.flags, SCOUTFS_LOG_TREES_FINALIZED);
+		lt.finalize_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
+	}
 
 	err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
 				   &super->logs_root, &key, &lt, sizeof(lt));
@@ -1638,7 +1638,7 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
 out:
 	mutex_unlock(&server->logs_mutex);
 
-	if (ret < 0)
+	if (ret < 0 && ret != -EINPROGRESS)
 		scoutfs_err(sb, "server error %d reclaiming log trees for rid %016llx: %s",
 			    ret, rid, err_str);
 
@@ -3536,26 +3536,37 @@ struct farewell_request {
  * Reclaim all the resources for a mount which has gone away.  It's sent
  * us a farewell promising to leave or we actively fenced it.
  *
- * It's safe to call this multiple times for a given rid.  Each
- * individual action knows to recognize that it's already been performed
- * and return success.
+ * This can be called multiple times across different servers for
+ * different reclaim attempts.  The existence of the mounted_client item
+ * triggers reclaim and must be deleted last.  Each step knows that it
+ * can be called multiple times and safely recognizes that its work
+ * might have already been done.
+ *
+ * Some steps (reclaiming large fragmented allocators) may need multiple
+ * calls to complete.  They return -EINPROGRESS which tells us to apply
+ * the server commit and retry.
  */
 static int reclaim_rid(struct super_block *sb, u64 rid)
 {
 	COMMIT_HOLD(hold);
 	int ret;
+	int err;
 
-	server_hold_commit(sb, &hold);
+	do {
+		server_hold_commit(sb, &hold);
 
-	/* delete mounted client last, recovery looks for it */
-	ret = scoutfs_lock_server_farewell(sb, rid) ?:
-	      reclaim_open_log_tree(sb, rid) ?:
-	      cancel_srch_compact(sb, rid) ?:
-	      cancel_log_merge(sb, rid) ?:
-	      scoutfs_omap_remove_rid(sb, rid) ?:
-	      delete_mounted_client(sb, rid);
+		/* delete mounted client last, recovery looks for it */
+		err = scoutfs_lock_server_farewell(sb, rid) ?:
+		      reclaim_open_log_tree(sb, rid) ?:
+		      cancel_srch_compact(sb, rid) ?:
+		      cancel_log_merge(sb, rid) ?:
+		      scoutfs_omap_remove_rid(sb, rid) ?:
+		      delete_mounted_client(sb, rid);
 
-	return server_apply_commit(sb, &hold, ret);
+		ret = server_apply_commit(sb, &hold, err == -EINPROGRESS ? 0 : err);
+
+	} while (err == -EINPROGRESS && ret == 0);
+
+	return ret;
 }
 
 /*