Compare commits

..

3 Commits

Author SHA1 Message Date
Auke Kok
4e2c1e83be Optionally print out xattr values
We cannot validate that totl keys have the correct count/value without
also extracting and printing the value for xattrs, which by default
is omitted (a sane default). Add a default-disabled --xattr-values/-V
flag to enable printing these out.

Because xattr values can span multiple items, this only will print
out the first one, and ellipsize it if it continues elsewhere. It is
filtered through isprint() to avoid printing non-printable characters.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-16 11:26:29 -07:00
Auke Kok
8c4e9bfa3e Add print filters for remaining types.
Chris already added print filters for most common types, but
omitted totl, indx, orphan, quota, and inode_index items. This
adds those as well, completing the set.

Signed-off-by: Auke Kok <auke.kok@versity.com>
2026-04-16 11:01:47 -07:00
Chris Kirby
91638191de Add finer grained options to scoutfs print
The default output from scoutfs print can be very large, even
when using the -S option. Add three new command line options
to allow more targeted selection of btrees and their items.

--allocs prints the metadata and data allocators
--roots allows the selection of btree roots to walk (logs, srch, fs)
--items allows the selection of items to print from the selected btrees

Signed-off-by: Chris Kirby <ckirby@versity.com>
2025-11-21 10:39:58 -06:00
28 changed files with 677 additions and 875 deletions

View File

@@ -1,39 +1,6 @@
Versity ScoutFS Release Notes
=============================
---
v1.27
\
*Jan 15, 2026*
Switch away from using the general VM cache reclaim machinery to reduce
idle cluster locks in the client. The VM treated locks like a cache and
let many accumulate, presuming that it would be efficient to free them
in batches. Lock freeing requires network communication so this could
result in enormous backlogs in network messages (on the order of
hundreds of thousands) and could result in significant delays of other
network messaging.
Fix inefficient network receive processing while many messages are in
the send queue. This consumed sufficient CPU to cause significant
stalls, perhaps resulting in hung task warning messages due to delayed
lock message delivery.
Fix a server livelock case that could happen while committing client
transactions that contain a large amount of freed file data extents.
This would present as client tasks hanging and a server task spinning
consuming cpu.
Fix a rare server request processing failure that doesn't deal with
retransmission of a request that a previous server partially processed.
This would present as hung client tasks and repeated "error -2
committing log merge: getting merge status item" kernel messages.
Fix an unnecessary server shutdown during specific circumstances in
client lock recovery. The shutdown was due to server state and was
ultimately harmless. The next server that started up would proceed
accordingly.
---
v1.26
\

View File

@@ -125,6 +125,7 @@
EXPAND_COUNTER(item_update) \
EXPAND_COUNTER(item_write_dirty) \
EXPAND_COUNTER(lock_alloc) \
EXPAND_COUNTER(lock_count_objects) \
EXPAND_COUNTER(lock_free) \
EXPAND_COUNTER(lock_grant_request) \
EXPAND_COUNTER(lock_grant_response) \
@@ -138,13 +139,13 @@
EXPAND_COUNTER(lock_lock_error) \
EXPAND_COUNTER(lock_nonblock_eagain) \
EXPAND_COUNTER(lock_recover_request) \
EXPAND_COUNTER(lock_scan_objects) \
EXPAND_COUNTER(lock_shrink_attempted) \
EXPAND_COUNTER(lock_shrink_request_failed) \
EXPAND_COUNTER(lock_shrink_aborted) \
EXPAND_COUNTER(lock_shrink_work) \
EXPAND_COUNTER(lock_unlock) \
EXPAND_COUNTER(lock_wait) \
EXPAND_COUNTER(log_merge_complete) \
EXPAND_COUNTER(log_merge_no_finalized) \
EXPAND_COUNTER(log_merge_start) \
EXPAND_COUNTER(log_merge_wait_timeout) \
EXPAND_COUNTER(net_dropped_response) \
EXPAND_COUNTER(net_send_bytes) \
@@ -159,7 +160,6 @@
EXPAND_COUNTER(orphan_scan) \
EXPAND_COUNTER(orphan_scan_attempts) \
EXPAND_COUNTER(orphan_scan_cached) \
EXPAND_COUNTER(orphan_scan_empty) \
EXPAND_COUNTER(orphan_scan_error) \
EXPAND_COUNTER(orphan_scan_item) \
EXPAND_COUNTER(orphan_scan_omap_set) \

View File

@@ -1637,14 +1637,10 @@ int scoutfs_inode_orphan_delete(struct super_block *sb, u64 ino, struct scoutfs_
struct scoutfs_lock *primary)
{
struct scoutfs_key key;
int ret;
init_orphan_key(&key, ino);
ret = scoutfs_item_delete_force(sb, &key, lock, primary);
trace_scoutfs_inode_orphan_delete(sb, ino, ret);
return ret;
return scoutfs_item_delete_force(sb, &key, lock, primary);
}
/*
@@ -1726,8 +1722,6 @@ out:
scoutfs_release_trans(sb);
scoutfs_inode_index_unlock(sb, &ind_locks);
trace_scoutfs_delete_inode_end(sb, ino, mode, size, ret);
return ret;
}
@@ -1823,9 +1817,6 @@ out:
* they've checked that the inode could really be deleted. We serialize
* on a bit in the lock data so that we only have one deletion attempt
* per inode under this mount's cluster lock.
*
* Returns -EAGAIN if we either did some cleanup work or are unable to finish
* cleaning up this inode right now.
*/
static int try_delete_inode_items(struct super_block *sb, u64 ino)
{
@@ -1839,8 +1830,6 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
int bit_nr;
int ret;
trace_scoutfs_try_delete(sb, ino);
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_WRITE, 0, ino, &lock);
if (ret < 0)
goto out;
@@ -1853,32 +1842,27 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
/* only one local attempt per inode at a time */
if (test_and_set_bit(bit_nr, ldata->trying)) {
trace_scoutfs_try_delete_local_busy(sb, ino);
ret = -EAGAIN;
ret = 0;
goto out;
}
clear_trying = true;
/* can't delete if it's cached in local or remote mounts */
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
trace_scoutfs_try_delete_cached(sb, ino);
ret = -EAGAIN;
ret = 0;
goto out;
}
scoutfs_inode_init_key(&key, ino);
ret = lookup_inode_item(sb, &key, &sinode, lock);
if (ret < 0) {
if (ret == -ENOENT) {
trace_scoutfs_try_delete_no_item(sb, ino);
if (ret == -ENOENT)
ret = 0;
}
goto out;
}
if (le32_to_cpu(sinode.nlink) > 0) {
trace_scoutfs_try_delete_has_links(sb, ino, le32_to_cpu(sinode.nlink));
ret = -EAGAIN;
ret = 0;
goto out;
}
@@ -1887,10 +1871,8 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
goto out;
ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
if (ret == 0) {
ret = -EAGAIN;
if (ret == 0)
scoutfs_inc_counter(sb, inode_deleted);
}
out:
if (clear_trying)
@@ -2092,10 +2074,6 @@ void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb)
* a locally cached inode. Then we ask the server for the open map
* containing the inode. Only if we don't see any cached users do we do
* the expensive work of acquiring locks to try and delete the items.
*
* We need to track whether there is any orphan cleanup work remaining so
* that tests such as inode-deletion can watch the orphan_scan_empty counter
* to determine when inode cleanup from open-unlink scenarios is complete.
*/
static void inode_orphan_scan_worker(struct work_struct *work)
{
@@ -2107,14 +2085,11 @@ static void inode_orphan_scan_worker(struct work_struct *work)
SCOUTFS_BTREE_ITEM_REF(iref);
struct scoutfs_key last;
struct scoutfs_key key;
bool work_todo = false;
u64 group_nr;
int bit_nr;
u64 ino;
int ret;
trace_scoutfs_orphan_scan_start(sb);
scoutfs_inc_counter(sb, orphan_scan);
init_orphan_key(&last, U64_MAX);
@@ -2134,10 +2109,8 @@ static void inode_orphan_scan_worker(struct work_struct *work)
init_orphan_key(&key, ino);
ret = scoutfs_btree_next(sb, &roots.fs_root, &key, &iref);
if (ret < 0) {
if (ret == -ENOENT) {
trace_scoutfs_orphan_scan_work(sb, 0);
if (ret == -ENOENT)
break;
}
goto out;
}
@@ -2152,7 +2125,6 @@ static void inode_orphan_scan_worker(struct work_struct *work)
/* locally cached inodes will try to delete as they evict */
if (scoutfs_omap_test(sb, ino)) {
work_todo = true;
scoutfs_inc_counter(sb, orphan_scan_cached);
continue;
}
@@ -2168,22 +2140,13 @@ static void inode_orphan_scan_worker(struct work_struct *work)
/* remote cached inodes will also try to delete */
if (test_bit_le(bit_nr, omap.bits)) {
work_todo = true;
scoutfs_inc_counter(sb, orphan_scan_omap_set);
continue;
}
/* seemingly orphaned and unused, get locks and check for sure */
scoutfs_inc_counter(sb, orphan_scan_attempts);
trace_scoutfs_orphan_scan_work(sb, ino);
ret = try_delete_inode_items(sb, ino);
if (ret == -EAGAIN) {
work_todo = true;
ret = 0;
}
trace_scoutfs_orphan_scan_end(sb, ino, ret);
}
ret = 0;
@@ -2192,11 +2155,6 @@ out:
if (ret < 0)
scoutfs_inc_counter(sb, orphan_scan_error);
if (!work_todo)
scoutfs_inc_counter(sb, orphan_scan_empty);
trace_scoutfs_orphan_scan_stop(sb, work_todo);
scoutfs_inode_schedule_orphan_dwork(sb);
}

View File

@@ -53,10 +53,8 @@
* all access to the lock (by revoking it down to a null mode) then the
* lock is freed.
*
* Each client has a configurable number of locks that are allowed to
* remain idle after being granted, for use by future tasks. Past the
* limit locks are freed by requesting a null mode from the server,
* governed by an LRU.
* Memory pressure on the client can cause the client to request a null
* mode from the server so that once it's granted the lock can be freed.
*
* So far we've only needed a minimal trylock. We return -EAGAIN if a
* lock attempt can't immediately match an existing granted lock. This
@@ -81,11 +79,14 @@ struct lock_info {
bool unmounting;
struct rb_root lock_tree;
struct rb_root lock_range_tree;
u64 nr_locks;
KC_DEFINE_SHRINKER(shrinker);
struct list_head lru_list;
unsigned long long lru_nr;
struct workqueue_struct *workq;
struct work_struct inv_work;
struct list_head inv_list;
struct work_struct shrink_work;
struct list_head shrink_list;
atomic64_t next_refresh_gen;
struct dentry *tseq_dentry;
@@ -248,6 +249,7 @@ static void lock_free(struct lock_info *linfo, struct scoutfs_lock *lock)
BUG_ON(!RB_EMPTY_NODE(&lock->range_node));
BUG_ON(!list_empty(&lock->lru_head));
BUG_ON(!list_empty(&lock->inv_head));
BUG_ON(!list_empty(&lock->shrink_head));
BUG_ON(!list_empty(&lock->cov_list));
kfree(lock->inode_deletion_data);
@@ -275,6 +277,7 @@ static struct scoutfs_lock *lock_alloc(struct super_block *sb,
INIT_LIST_HEAD(&lock->lru_head);
INIT_LIST_HEAD(&lock->inv_head);
INIT_LIST_HEAD(&lock->inv_list);
INIT_LIST_HEAD(&lock->shrink_head);
spin_lock_init(&lock->cov_list_lock);
INIT_LIST_HEAD(&lock->cov_list);
@@ -407,7 +410,6 @@ static bool lock_insert(struct super_block *sb, struct scoutfs_lock *ins)
rb_link_node(&ins->node, parent, node);
rb_insert_color(&ins->node, &linfo->lock_tree);
linfo->nr_locks++;
scoutfs_tseq_add(&linfo->tseq_tree, &ins->tseq_entry);
return true;
@@ -422,7 +424,6 @@ static void lock_remove(struct lock_info *linfo, struct scoutfs_lock *lock)
rb_erase(&lock->range_node, &linfo->lock_range_tree);
RB_CLEAR_NODE(&lock->range_node);
linfo->nr_locks--;
scoutfs_tseq_del(&linfo->tseq_tree, &lock->tseq_entry);
}
@@ -462,8 +463,10 @@ static void __lock_del_lru(struct lock_info *linfo, struct scoutfs_lock *lock)
{
assert_spin_locked(&linfo->lock);
if (!list_empty(&lock->lru_head))
if (!list_empty(&lock->lru_head)) {
list_del_init(&lock->lru_head);
linfo->lru_nr--;
}
}
/*
@@ -522,16 +525,14 @@ static struct scoutfs_lock *create_lock(struct super_block *sb,
* indicate that the lock wasn't idle. If it really is idle then we
* either free it if it's null or put it back on the lru.
*/
static void __put_lock(struct lock_info *linfo, struct scoutfs_lock *lock, bool tail)
static void put_lock(struct lock_info *linfo,struct scoutfs_lock *lock)
{
assert_spin_locked(&linfo->lock);
if (lock_idle(lock)) {
if (lock->mode != SCOUTFS_LOCK_NULL) {
if (tail)
list_add_tail(&lock->lru_head, &linfo->lru_list);
else
list_add(&lock->lru_head, &linfo->lru_list);
list_add_tail(&lock->lru_head, &linfo->lru_list);
linfo->lru_nr++;
} else {
lock_remove(linfo, lock);
lock_free(linfo, lock);
@@ -539,11 +540,6 @@ static void __put_lock(struct lock_info *linfo, struct scoutfs_lock *lock, bool
}
}
static inline void put_lock(struct lock_info *linfo, struct scoutfs_lock *lock)
{
__put_lock(linfo, lock, true);
}
/*
* The caller has made a change (set a lock mode) which can let one of the
* invalidating locks make forward progress.
@@ -717,14 +713,14 @@ static void lock_invalidate_worker(struct work_struct *work)
/* only lock protocol, inv can't call subsystems after shutdown */
if (!linfo->shutdown) {
ret = lock_invalidate(sb, lock, nl->old_mode, nl->new_mode);
BUG_ON(ret < 0 && ret != -ENOLINK);
BUG_ON(ret);
}
/* respond with the key and modes from the request, server might have died */
ret = scoutfs_client_lock_response(sb, ireq->net_id, nl);
if (ret == -ENOTCONN)
ret = 0;
BUG_ON(ret < 0 && ret != -ENOLINK);
BUG_ON(ret);
scoutfs_inc_counter(sb, lock_invalidate_response);
}
@@ -879,69 +875,6 @@ int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
return ret;
}
/*
* This is called on every _lock call to try and keep the number of
* locks under the idle count. We're intentionally trying to throttle
* shrinking bursts by tying its frequency to lock use. It will only
* send requests to free unused locks, though, so it's always possible
* to exceed the high water mark under heavy load.
*
* We send a null request and the lock will be freed by the response
* once all users drain. If this races with invalidation then the
* server will only send the grant response once the invalidation is
* finished.
*
* When force is set the idle count comparisons are skipped and the
* first lock on the lru, if there is one, is always chosen.
*
* Returns true only if a lock was taken off the lru and its null
* request was sent without error, false otherwise.
*/
static bool try_shrink_lock(struct super_block *sb, struct lock_info *linfo, bool force)
{
struct scoutfs_mount_options opts;
struct scoutfs_lock *lock = NULL;
struct scoutfs_net_lock nl;
int ret = 0;
scoutfs_options_read(sb, &opts);
/* avoiding lock contention with unsynchronized test, don't mind temp false results */
if (!force && (list_empty(&linfo->lru_list) ||
READ_ONCE(linfo->nr_locks) <= opts.lock_idle_count))
return false;
spin_lock(&linfo->lock);
/* repeat the test under the spinlock before committing to a lock */
lock = list_first_entry_or_null(&linfo->lru_list, struct scoutfs_lock, lru_head);
if (lock && (force || (linfo->nr_locks > opts.lock_idle_count))) {
/* pull it off the lru and fill the request while still holding the spinlock */
__lock_del_lru(linfo, lock);
lock->request_pending = 1;
nl.key = lock->start;
nl.old_mode = lock->mode;
nl.new_mode = SCOUTFS_LOCK_NULL;
} else {
lock = NULL;
}
spin_unlock(&linfo->lock);
/* nl is only initialized, and only read, when a lock was chosen */
if (lock) {
/* the request is sent without holding the spinlock */
ret = scoutfs_client_lock_request(sb, &nl);
if (ret < 0) {
/* nothing was sent: undo the pending state and give the lock back, tail == false */
scoutfs_inc_counter(sb, lock_shrink_request_failed);
spin_lock(&linfo->lock);
lock->request_pending = 0;
wake_up(&lock->waitq);
__put_lock(linfo, lock, false);
spin_unlock(&linfo->lock);
} else {
scoutfs_inc_counter(sb, lock_shrink_attempted);
trace_scoutfs_lock_shrink(sb, lock);
}
}
return lock && ret == 0;
}
static bool lock_wait_cond(struct super_block *sb, struct scoutfs_lock *lock,
enum scoutfs_lock_mode mode)
{
@@ -1004,8 +937,6 @@ static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, i
if (WARN_ON_ONCE(scoutfs_trans_held()))
return -EDEADLK;
try_shrink_lock(sb, linfo, false);
spin_lock(&linfo->lock);
/* drops and re-acquires lock if it allocates */
@@ -1449,12 +1380,134 @@ bool scoutfs_lock_protected(struct scoutfs_lock *lock, struct scoutfs_key *key,
&lock->start, &lock->end) == 0;
}
/*
* The shrink callback got the lock, marked it request_pending, and put
* it on the shrink list. We send a null request and the lock will be
* freed by the response once all users drain. If this races with
* invalidation then the server will only send the grant response once
* the invalidation is finished.
*
* Locks whose request could not be sent have request_pending cleared
* and are handed back through put_lock(); request_pending is left set
* on the locks whose request was sent.
*/
static void lock_shrink_worker(struct work_struct *work)
{
struct lock_info *linfo = container_of(work, struct lock_info,
shrink_work);
struct super_block *sb = linfo->sb;
struct scoutfs_net_lock nl;
struct scoutfs_lock *lock;
struct scoutfs_lock *tmp;
LIST_HEAD(list);
int ret;
scoutfs_inc_counter(sb, lock_shrink_work);
/* take everything queued so far onto a private list, requests are sent unlocked */
spin_lock(&linfo->lock);
list_splice_init(&linfo->shrink_list, &list);
spin_unlock(&linfo->lock);
list_for_each_entry_safe(lock, tmp, &list, shrink_head) {
/* unlink first so shrink_head is empty no matter how the request goes */
list_del_init(&lock->shrink_head);
/* unlocked lock access, but should be stable since we queued */
nl.key = lock->start;
nl.old_mode = lock->mode;
nl.new_mode = SCOUTFS_LOCK_NULL;
ret = scoutfs_client_lock_request(sb, &nl);
if (ret) {
/* oh well, not freeing */
scoutfs_inc_counter(sb, lock_shrink_aborted);
/* undo the pending state under the spinlock and wake anyone waiting on it */
spin_lock(&linfo->lock);
lock->request_pending = 0;
wake_up(&lock->waitq);
put_lock(linfo, lock);
spin_unlock(&linfo->lock);
}
}
}
/*
 * Shrinker count callback: bump the lock_count_objects counter and
 * report the current number of locks on the lru, passed through
 * shrinker_min_long().  lru_nr is read without taking linfo->lock.
 */
static unsigned long lock_count_objects(struct shrinker *shrink,
					struct shrink_control *sc)
{
	struct lock_info *info = KC_SHRINKER_CONTAINER_OF(shrink, struct lock_info);
	struct super_block *sb = info->sb;
	unsigned long nr_idle;

	scoutfs_inc_counter(sb, lock_count_objects);

	nr_idle = shrinker_min_long(info->lru_nr);
	return nr_idle;
}
/*
* Start the shrinking process for locks on the lru. If a lock is on
* the lru then it can't have any active users. We don't want to block
* or allocate here so all we do is get the lock, mark it request
* pending, and kick off the work. The work sends a null request and
* eventually the lock is freed by its response.
*
* Only a racing lock attempt that isn't matched can prevent the lock
* from being freed. It'll block waiting to send its request for its
* mode which will prevent the lock from being freed when the null
* response arrives.
*
* Returns the number of locks moved from the lru to the shrink list,
* which is at most sc->nr_to_scan. Nothing has actually been freed by
* the time this returns.
*/
static unsigned long lock_scan_objects(struct shrinker *shrink,
struct shrink_control *sc)
{
struct lock_info *linfo = KC_SHRINKER_CONTAINER_OF(shrink, struct lock_info);
struct super_block *sb = linfo->sb;
struct scoutfs_lock *lock;
struct scoutfs_lock *tmp;
unsigned long freed = 0;
unsigned long nr = sc->nr_to_scan;
bool added = false;
scoutfs_inc_counter(sb, lock_scan_objects);
spin_lock(&linfo->lock);
restart:
list_for_each_entry_safe(lock, tmp, &linfo->lru_list, lru_head) {
/* everything on the lru is expected to be idle, granted, and not already queued */
BUG_ON(!lock_idle(lock));
BUG_ON(lock->mode == SCOUTFS_LOCK_NULL);
BUG_ON(!list_empty(&lock->shrink_head));
/* nr counts down across restarts, stop once nr_to_scan locks were queued */
if (nr-- == 0)
break;
__lock_del_lru(linfo, lock);
lock->request_pending = 1;
list_add_tail(&lock->shrink_head, &linfo->shrink_list);
added = true;
freed++;
scoutfs_inc_counter(sb, lock_shrink_attempted);
trace_scoutfs_lock_shrink(sb, lock);
/* could have bazillions of idle locks */
/* NOTE(review): nonzero presumably means the spinlock was dropped, so tmp is stale and we rescan from the head */
if (cond_resched_lock(&linfo->lock))
goto restart;
}
spin_unlock(&linfo->lock);
/* only kick the worker if this pass queued something */
if (added)
queue_work(linfo->workq, &linfo->shrink_work);
trace_scoutfs_lock_shrink_exit(sb, sc->nr_to_scan, freed);
return freed;
}
void scoutfs_free_unused_locks(struct super_block *sb)
{
DECLARE_LOCK_INFO(sb, linfo);
struct lock_info *linfo = SCOUTFS_SB(sb)->lock_info;
struct shrink_control sc = {
.gfp_mask = GFP_NOFS,
.nr_to_scan = INT_MAX,
};
while (try_shrink_lock(sb, linfo, true))
cond_resched();
lock_scan_objects(KC_SHRINKER_FN(&linfo->shrinker), &sc);
}
static void lock_tseq_show(struct seq_file *m, struct scoutfs_tseq_entry *ent)
@@ -1537,10 +1590,10 @@ u64 scoutfs_lock_ino_refresh_gen(struct super_block *sb, u64 ino)
* transitions and sending requests. We set the shutdown flag to catch
* anyone who breaks this rule.
*
* With no more lock callers, we'll no longer try to shrink the pool of
* granted locks. We'll free all of them as _destroy() is called after
* the farewell response indicates that the server tore down all our
* lock state.
* We unregister the shrinker so that we won't try and send null
* requests in response to memory pressure. The locks will all be
* unceremoniously dropped once we get a farewell response from the
* server which indicates that they destroyed our locking state.
*
* We will still respond to invalidation requests that have to be
* processed to let unmount in other mounts acquire locks and make
@@ -1560,6 +1613,10 @@ void scoutfs_lock_shutdown(struct super_block *sb)
trace_scoutfs_lock_shutdown(sb, linfo);
/* stop the shrinker from queueing work */
KC_UNREGISTER_SHRINKER(&linfo->shrinker);
flush_work(&linfo->shrink_work);
/* cause current and future lock calls to return errors */
spin_lock(&linfo->lock);
linfo->shutdown = true;
@@ -1650,6 +1707,8 @@ void scoutfs_lock_destroy(struct super_block *sb)
list_del_init(&lock->inv_head);
lock->invalidate_pending = 0;
}
if (!list_empty(&lock->shrink_head))
list_del_init(&lock->shrink_head);
lock_remove(linfo, lock);
lock_free(linfo, lock);
}
@@ -1674,9 +1733,14 @@ int scoutfs_lock_setup(struct super_block *sb)
spin_lock_init(&linfo->lock);
linfo->lock_tree = RB_ROOT;
linfo->lock_range_tree = RB_ROOT;
KC_INIT_SHRINKER_FUNCS(&linfo->shrinker, lock_count_objects,
lock_scan_objects);
KC_REGISTER_SHRINKER(&linfo->shrinker, "scoutfs-lock:" SCSBF, SCSB_ARGS(sb));
INIT_LIST_HEAD(&linfo->lru_list);
INIT_WORK(&linfo->inv_work, lock_invalidate_worker);
INIT_LIST_HEAD(&linfo->inv_list);
INIT_WORK(&linfo->shrink_work, lock_shrink_worker);
INIT_LIST_HEAD(&linfo->shrink_list);
atomic64_set(&linfo->next_refresh_gen, 0);
scoutfs_tseq_tree_init(&linfo->tseq_tree, lock_tseq_show);

View File

@@ -506,19 +506,6 @@ out:
* because we don't know which locks they'll hold. Once recover
* finishes the server calls us to kick all the locks that were waiting
* during recovery.
*
* The calling server shuts down if we return errors indicating that we
* weren't able to ensure forward progress in the lock state machine.
*
* Failure to send to a disconnected client is not a fatal error.
* During normal disconnection the client's state is removed before
* their connection is destroyed. We can't use state to try and send to
* a non-existing connection. But a client that fails to reconnect is
* disconnected before being fenced. If we have multiple disconnected
* clients we can try to send to one while cleaning up another. If
* they've uncleanly disconnected their locks are going to be removed
* and the lock can make forward progress again. Or we'll shutdown for
* failure to fence.
*/
static int process_waiting_requests(struct super_block *sb,
struct server_lock_node *snode)
@@ -610,10 +597,6 @@ static int process_waiting_requests(struct super_block *sb,
out:
put_server_lock(inf, snode);
/* disconnected clients will be fenced, trying to send to them isn't fatal */
if (ret == -ENOTCONN)
ret = 0;
return ret;
}

View File

@@ -21,7 +21,6 @@
#include <net/tcp.h>
#include <linux/log2.h>
#include <linux/jhash.h>
#include <linux/rbtree.h>
#include "format.h"
#include "counters.h"
@@ -126,7 +125,6 @@ struct message_send {
unsigned long dead:1;
struct list_head head;
scoutfs_net_response_t resp_func;
struct rb_node node;
void *resp_data;
struct scoutfs_net_header nh;
};
@@ -163,118 +161,49 @@ static bool nh_is_request(struct scoutfs_net_header *nh)
return !nh_is_response(nh);
}
/*
 * Three-way compare of a search position against a queued send.
 * Requests are keyed by their header id and responses by their header
 * seq.  Returns -1, 0, or 1 as pos is less than, equal to, or greater
 * than the message's key.
 */
static int cmp_sorted_msend(u64 pos, struct message_send *msend)
{
	u64 key;

	if (nh_is_request(&msend->nh))
		key = le64_to_cpu(msend->nh.id);
	else
		key = le64_to_cpu(msend->nh.seq);

	if (pos < key)
		return -1;
	if (pos > key)
		return 1;
	return 0;
}
/*
* Walk the sorted tree looking for pos, optionally inserting a node.
*
* Returns the entry that matches pos exactly or, if there is none, the
* entry with the smallest key greater than pos. Returns NULL if every
* entry sorts before pos or the tree is empty.
*
* If ins is not NULL it is linked into the tree at the position the
* walk ended on. Inserting a position that already exists is a bug.
* The return value is computed the same way whether or not ins is
* given.
*/
static struct message_send *search_sorted_msends(struct rb_root *root, u64 pos, struct rb_node *ins)
{
struct rb_node **node = &root->rb_node;
struct rb_node *parent = NULL;
struct message_send *msend = NULL;
struct message_send *next = NULL;
/* stays non-zero for an empty tree so that the BUG_ON below can't trip */
int cmp = -1;
while (*node) {
parent = *node;
msend = container_of(*node, struct message_send, node);
cmp = cmp_sorted_msend(pos, msend);
if (cmp < 0) {
/* pos sorts before this entry, it's the best following entry seen so far */
next = msend;
node = &(*node)->rb_left;
} else if (cmp > 0) {
node = &(*node)->rb_right;
} else {
next = msend;
break;
}
}
BUG_ON(cmp == 0 && ins);
if (ins) {
/* node points at the empty child slot where the walk fell off the tree */
rb_link_node(ins, parent, node);
rb_insert_color(ins, root);
}
return next;
}
/*
 * Return the send that follows msend in its sorted tree, or NULL if
 * msend is the last entry.
 */
static struct message_send *next_sorted_msend(struct message_send *msend)
{
	struct rb_node *succ = rb_next(&msend->node);

	if (!succ)
		return NULL;

	return rb_entry(succ, struct message_send, node);
}
/*
* Iterate over the sends in ROOT_ in sorted order, starting from the
* entry that search_sorted_msends() returns for POS_. The following
* entry is saved in TMP_ before each pass through the body, so the
* body can erase MSEND_ from the tree.
*/
#define for_each_sorted_msend(MSEND_, TMP_, ROOT_, POS_) \
for (MSEND_ = search_sorted_msends(ROOT_, POS_, NULL); \
MSEND_ != NULL && ({ TMP_ = next_sorted_msend(MSEND_); true; }); \
MSEND_ = TMP_)
static void insert_sorted_msend(struct scoutfs_net_connection *conn, struct message_send *msend)
{
BUG_ON(!RB_EMPTY_NODE(&msend->node));
if (nh_is_request(&msend->nh))
search_sorted_msends(&conn->req_root, le64_to_cpu(msend->nh.id), &msend->node);
else
search_sorted_msends(&conn->resp_root, le64_to_cpu(msend->nh.seq), &msend->node);
}
/*
 * Remove a send from the connection's sorted tree for its message
 * type and mark its node as unlinked.  Nothing is done if the send
 * isn't currently linked into a tree.
 */
static void erase_sorted_msend(struct scoutfs_net_connection *conn, struct message_send *msend)
{
	struct rb_root *root;

	if (RB_EMPTY_NODE(&msend->node))
		return;

	if (nh_is_request(&msend->nh))
		root = &conn->req_root;
	else
		root = &conn->resp_root;

	rb_erase(&msend->node, root);
	RB_CLEAR_NODE(&msend->node);
}
static void move_sorted_msends(struct scoutfs_net_connection *dst_conn, struct rb_root *dst_root,
struct scoutfs_net_connection *src_conn, struct rb_root *src_root)
{
struct message_send *msend;
struct message_send *tmp;
for_each_sorted_msend(msend, tmp, src_root, 0) {
erase_sorted_msend(src_conn, msend);
insert_sorted_msend(dst_conn, msend);
}
}
/*
* Pending requests are uniquely identified by the id they were assigned
* as they were first put on the send queue.
* We return dead requests so that the caller can stop searching other
* lists for the dead request that we found.
*/
static struct message_send *find_request(struct scoutfs_net_connection *conn, u8 cmd, u64 id)
static struct message_send *search_list(struct scoutfs_net_connection *conn,
struct list_head *list,
u8 cmd, u64 id)
{
struct message_send *msend;
assert_spin_locked(&conn->lock);
msend = search_sorted_msends(&conn->req_root, id, NULL);
if (msend && !(msend->nh.cmd == cmd && le64_to_cpu(msend->nh.id) == id))
msend = NULL;
list_for_each_entry(msend, list, head) {
if (nh_is_request(&msend->nh) && msend->nh.cmd == cmd &&
le64_to_cpu(msend->nh.id) == id)
return msend;
}
return NULL;
}
/*
 * Look up a pending request by command and id.  The resend queue is
 * searched first and the send queue is only searched if that finds
 * nothing.  A match that has been marked dead is reported as not
 * found, so the caller gets NULL.
 */
static struct message_send *find_request(struct scoutfs_net_connection *conn,
					 u8 cmd, u64 id)
{
	struct message_send *found;

	found = search_list(conn, &conn->resend_queue, cmd, id);
	if (!found)
		found = search_list(conn, &conn->send_queue, cmd, id);

	if (found && found->dead)
		return NULL;

	return found;
}
/*
* Free a send message by moving it to the send queue and marking it
* dead. It is removed from the sorted rb roots so it won't be visible
* as a request for response processing.
* Complete a send message by moving it to the send queue and marking it
* to be freed. It won't be visible to callers trying to find sends.
*/
static void queue_dead_free(struct scoutfs_net_connection *conn, struct message_send *msend)
static void complete_send(struct scoutfs_net_connection *conn,
struct message_send *msend)
{
assert_spin_locked(&conn->lock);
@@ -284,7 +213,6 @@ static void queue_dead_free(struct scoutfs_net_connection *conn, struct message_
msend->dead = 1;
list_move(&msend->head, &conn->send_queue);
erase_sorted_msend(conn, msend);
queue_work(conn->workq, &conn->send_work);
}
@@ -442,7 +370,6 @@ static int submit_send(struct super_block *sb,
msend->resp_func = resp_func;
msend->resp_data = resp_data;
msend->dead = 0;
RB_CLEAR_NODE(&msend->node);
msend->nh.seq = cpu_to_le64(seq);
msend->nh.recv_seq = 0; /* set when sent, not when queued */
@@ -463,7 +390,6 @@ static int submit_send(struct super_block *sb,
} else {
list_add_tail(&msend->head, &conn->resend_queue);
}
insert_sorted_msend(conn, msend);
if (id_ret)
*id_ret = le64_to_cpu(msend->nh.id);
@@ -533,7 +459,7 @@ static int process_response(struct scoutfs_net_connection *conn,
if (msend) {
resp_func = msend->resp_func;
resp_data = msend->resp_data;
queue_dead_free(conn, msend);
complete_send(conn, msend);
} else {
scoutfs_inc_counter(sb, net_dropped_response);
}
@@ -624,21 +550,43 @@ static void queue_ordered_proc(struct scoutfs_net_connection *conn, struct messa
* Free live responses up to and including the seq by marking them dead
* and moving them to the send queue to be freed.
*/
static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
/*
* Mark live responses on the given list with a seq up to and including
* seq as dead and move them to the head of the connection's send queue.
* The caller must hold conn->lock. Returns true if any message was
* moved; this function does not queue the send work itself.
*/
static bool move_acked_responses(struct scoutfs_net_connection *conn,
struct list_head *list, u64 seq)
{
struct message_send *msend;
struct message_send *tmp;
bool moved = false;
assert_spin_locked(&conn->lock);
list_for_each_entry_safe(msend, tmp, list, head) {
/* NOTE(review): stopping at the first larger seq presumes the list is in seq order -- confirm */
if (le64_to_cpu(msend->nh.seq) > seq)
break;
/* requests and already-dead messages are left where they are */
if (!nh_is_response(&msend->nh) || msend->dead)
continue;
msend->dead = 1;
/* list can be conn->send_queue itself, in which case this moves the entry to its head */
list_move(&msend->head, &conn->send_queue);
moved = true;
}
return moved;
}
/* acks are processed inline in the recv worker */
static void free_acked_responses(struct scoutfs_net_connection *conn, u64 seq)
{
bool moved;
spin_lock(&conn->lock);
for_each_sorted_msend(msend, tmp, &conn->resp_root, 0) {
if (le64_to_cpu(msend->nh.seq) > seq)
break;
queue_dead_free(conn, msend);
}
moved = move_acked_responses(conn, &conn->send_queue, seq) |
move_acked_responses(conn, &conn->resend_queue, seq);
spin_unlock(&conn->lock);
if (moved)
queue_work(conn->workq, &conn->send_work);
}
static int k_recvmsg(struct socket *sock, void *buf, unsigned len)
@@ -876,11 +824,9 @@ static int k_sendmsg_full(struct socket *sock, struct kvec *kv, unsigned long nr
return ret;
}
static void free_msend(struct net_info *ninf, struct scoutfs_net_connection *conn,
struct message_send *msend)
static void free_msend(struct net_info *ninf, struct message_send *msend)
{
list_del_init(&msend->head);
erase_sorted_msend(conn, msend);
scoutfs_tseq_del(&ninf->msg_tseq_tree, &msend->tseq_entry);
kfree(msend);
}
@@ -920,10 +866,9 @@ static void scoutfs_net_send_worker(struct work_struct *work)
count = 0;
spin_lock(&conn->lock);
list_for_each_entry_safe(msend, _msend_, &conn->send_queue, head) {
if (msend->dead) {
free_msend(ninf, conn, msend);
free_msend(ninf, msend);
continue;
}
@@ -1012,7 +957,7 @@ static void scoutfs_net_destroy_worker(struct work_struct *work)
list_splice_init(&conn->resend_queue, &conn->send_queue);
list_for_each_entry_safe(msend, tmp, &conn->send_queue, head)
free_msend(ninf, conn, msend);
free_msend(ninf, msend);
/* accepted sockets are removed from their listener's list */
if (conn->listening_conn) {
@@ -1358,7 +1303,7 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
struct message_send, head))) {
resp_func = msend->resp_func;
resp_data = msend->resp_data;
free_msend(ninf, conn, msend);
free_msend(ninf, msend);
spin_unlock(&conn->lock);
call_resp_func(sb, conn, resp_func, resp_data, NULL, 0, -ECONNABORTED);
@@ -1374,7 +1319,7 @@ static void scoutfs_net_shutdown_worker(struct work_struct *work)
list_splice_tail_init(&conn->send_queue, &conn->resend_queue);
list_for_each_entry_safe(msend, tmp, &conn->resend_queue, head) {
if (msend->nh.cmd == SCOUTFS_NET_CMD_GREETING)
free_msend(ninf, conn, msend);
free_msend(ninf, msend);
}
clear_conn_fl(conn, saw_greeting);
@@ -1548,8 +1493,6 @@ scoutfs_net_alloc_conn(struct super_block *sb,
atomic64_set(&conn->recv_seq, 0);
INIT_LIST_HEAD(&conn->send_queue);
INIT_LIST_HEAD(&conn->resend_queue);
conn->req_root = RB_ROOT;
conn->resp_root = RB_ROOT;
INIT_WORK(&conn->listen_work, scoutfs_net_listen_worker);
INIT_WORK(&conn->connect_work, scoutfs_net_connect_worker);
INIT_WORK(&conn->send_work, scoutfs_net_send_worker);
@@ -1762,7 +1705,7 @@ void scoutfs_net_client_greeting(struct super_block *sb,
atomic64_set(&conn->recv_seq, 0);
list_for_each_entry_safe(msend, tmp, &conn->resend_queue, head){
if (nh_is_response(&msend->nh))
free_msend(ninf, conn, msend);
free_msend(ninf, msend);
}
}
@@ -1865,8 +1808,6 @@ restart:
BUG_ON(!list_empty(&reconn->send_queue));
/* queued greeting response is racing, can be in send or resend queue */
list_splice_tail_init(&reconn->resend_queue, &conn->resend_queue);
move_sorted_msends(conn, &conn->req_root, reconn, &reconn->req_root);
move_sorted_msends(conn, &conn->resp_root, reconn, &reconn->resp_root);
/* new conn info is unused, swap, old won't call down */
swap(conn->info, reconn->info);

View File

@@ -67,8 +67,6 @@ struct scoutfs_net_connection {
u64 next_send_id;
struct list_head send_queue;
struct list_head resend_queue;
struct rb_root req_root;
struct rb_root resp_root;
atomic64_t recv_seq;
unsigned int ordered_proc_nr;

View File

@@ -34,7 +34,6 @@ enum {
Opt_data_prealloc_blocks,
Opt_data_prealloc_contig_only,
Opt_ino_alloc_per_lock,
Opt_lock_idle_count,
Opt_log_merge_wait_timeout_ms,
Opt_metadev_path,
Opt_noacl,
@@ -50,7 +49,6 @@ static const match_table_t tokens = {
{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
{Opt_ino_alloc_per_lock, "ino_alloc_per_lock=%s"},
{Opt_lock_idle_count, "lock_idle_count=%s"},
{Opt_log_merge_wait_timeout_ms, "log_merge_wait_timeout_ms=%s"},
{Opt_metadev_path, "metadev_path=%s"},
{Opt_noacl, "noacl"},
@@ -121,10 +119,6 @@ static void free_options(struct scoutfs_mount_options *opts)
kfree(opts->metadev_path);
}
#define MIN_LOCK_IDLE_COUNT 32
#define DEFAULT_LOCK_IDLE_COUNT (10 * 1000)
#define MAX_LOCK_IDLE_COUNT (100 * 1000)
#define MIN_LOG_MERGE_WAIT_TIMEOUT_MS 100UL
#define DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS 500
#define MAX_LOG_MERGE_WAIT_TIMEOUT_MS (60 * MSEC_PER_SEC)
@@ -145,7 +139,6 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
opts->data_prealloc_contig_only = 1;
opts->ino_alloc_per_lock = SCOUTFS_LOCK_INODE_GROUP_NR;
opts->lock_idle_count = DEFAULT_LOCK_IDLE_COUNT;
opts->log_merge_wait_timeout_ms = DEFAULT_LOG_MERGE_WAIT_TIMEOUT_MS;
opts->orphan_scan_delay_ms = -1;
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
@@ -153,21 +146,6 @@ static void init_default_options(struct scoutfs_mount_options *opts)
opts->tcp_keepalive_timeout_ms = DEFAULT_TCP_KEEPALIVE_TIMEOUT_MS;
}
/*
 * Validate a parsed lock_idle_count option.  @ret is the result of the
 * caller's integer parse and @val is the value it produced.  An error is
 * logged and -EINVAL is returned if the parse failed or if the value is
 * outside of [MIN_LOCK_IDLE_COUNT, MAX_LOCK_IDLE_COUNT].  Returns 0 when
 * the value is acceptable.
 */
static int verify_lock_idle_count(struct super_block *sb, int ret, int val)
{
	/* the parse result is checked first, val isn't trusted until it passed */
	if (ret < 0) {
		scoutfs_err(sb, "failed to parse lock_idle_count value");
		return -EINVAL;
	}

	if (val >= MIN_LOCK_IDLE_COUNT && val <= MAX_LOCK_IDLE_COUNT)
		return 0;

	scoutfs_err(sb, "invalid lock_idle_count value %d, must be between %u and %u",
		    val, MIN_LOCK_IDLE_COUNT, MAX_LOCK_IDLE_COUNT);
	return -EINVAL;
}
static int verify_log_merge_wait_timeout_ms(struct super_block *sb, int ret, int val)
{
if (ret < 0) {
@@ -283,14 +261,6 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
opts->tcp_keepalive_timeout_ms = nr;
break;
case Opt_lock_idle_count:
ret = match_int(args, &nr);
ret = verify_lock_idle_count(sb, ret, nr);
if (ret < 0)
return ret;
opts->lock_idle_count = nr;
break;
case Opt_log_merge_wait_timeout_ms:
ret = match_int(args, &nr);
ret = verify_log_merge_wait_timeout_ms(sb, ret, nr);
@@ -566,43 +536,6 @@ static ssize_t ino_alloc_per_lock_store(struct kobject *kobj, struct kobj_attrib
}
SCOUTFS_ATTR_RW(ino_alloc_per_lock);
/*
 * sysfs show handler for lock_idle_count.  Reads a copy of the mount
 * options for the super block behind @kobj and formats its
 * lock_idle_count into @buf, bounded by PAGE_SIZE.  No trailing newline
 * is written.  Returns snprintf()'s result.
 *
 * NOTE(review): the field is declared int in struct scoutfs_mount_options
 * but is formatted with %u.  That's harmless for the positive range that
 * verify_lock_idle_count() enforces; revisit if the range ever changes.
 */
static ssize_t lock_idle_count_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
struct scoutfs_mount_options opts;
scoutfs_options_read(sb, &opts);
return snprintf(buf, PAGE_SIZE, "%u", opts.lock_idle_count);
}
/*
 * sysfs store handler for lock_idle_count.  At most sizeof(nullterm) - 1
 * bytes of @buf are copied into a null terminated stack buffer, parsed as
 * an int with base 0, and checked by verify_lock_idle_count().  A valid
 * value is stored in the mount's options under the options seqlock and
 * @count is returned; otherwise the verifier's error is returned and the
 * options are left untouched.
 */
static ssize_t lock_idle_count_store(struct kobject *kobj, struct kobj_attribute *attr,
				     const char *buf, size_t count)
{
	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
	DECLARE_OPTIONS_INFO(sb, optinf);
	char nullterm[30]; /* more than enough for octal -U64_MAX */
	int parsed;
	int copy_len;
	int err;

	copy_len = min(count, sizeof(nullterm) - 1);
	memcpy(nullterm, buf, copy_len);
	nullterm[copy_len] = '\0';

	/* the verifier reports both parse failures and range failures */
	err = kstrtoint(nullterm, 0, &parsed);
	err = verify_lock_idle_count(sb, err, parsed);
	if (err != 0)
		return err;

	write_seqlock(&optinf->seqlock);
	optinf->opts.lock_idle_count = parsed;
	write_sequnlock(&optinf->seqlock);

	/* success consumes the whole write, count passes through the int */
	err = count;
	return err;
}
SCOUTFS_ATTR_RW(lock_idle_count);
static ssize_t log_merge_wait_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -744,7 +677,6 @@ static struct attribute *options_attrs[] = {
SCOUTFS_ATTR_PTR(data_prealloc_blocks),
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
SCOUTFS_ATTR_PTR(ino_alloc_per_lock),
SCOUTFS_ATTR_PTR(lock_idle_count),
SCOUTFS_ATTR_PTR(log_merge_wait_timeout_ms),
SCOUTFS_ATTR_PTR(metadev_path),
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),

View File

@@ -9,7 +9,6 @@ struct scoutfs_mount_options {
u64 data_prealloc_blocks;
bool data_prealloc_contig_only;
unsigned int ino_alloc_per_lock;
int lock_idle_count;
unsigned int log_merge_wait_timeout_ms;
char *metadev_path;
unsigned int orphan_scan_delay_ms;

View File

@@ -789,80 +789,6 @@ TRACE_EVENT(scoutfs_inode_walk_writeback,
__entry->ino, __entry->write, __entry->ret)
);
/*
 * Tracepoint that takes only the super block.  It records whatever
 * SCSB_TRACE_FIELDS declares and SCSB_TRACE_ASSIGN() fills in from @sb,
 * and prints just that SCSBF prefix with no additional fields.
 */
TRACE_EVENT(scoutfs_orphan_scan_start,
TP_PROTO(struct super_block *sb),
TP_ARGS(sb),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
),
TP_printk(SCSBF, SCSB_TRACE_ARGS)
);
/*
 * Tracepoint that records the super block trace fields along with the
 * caller's @work_todo flag.  The flag is stored as a bool and printed
 * with %d after the SCSBF prefix.
 */
TRACE_EVENT(scoutfs_orphan_scan_stop,
TP_PROTO(struct super_block *sb, bool work_todo),
TP_ARGS(sb, work_todo),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(bool, work_todo)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->work_todo = work_todo;
),
TP_printk(SCSBF" work_todo %d", SCSB_TRACE_ARGS, __entry->work_todo)
);
/*
 * Tracepoint that records the super block trace fields and a 64bit inode
 * number, printed as "ino %llu" after the SCSBF prefix.
 */
TRACE_EVENT(scoutfs_orphan_scan_work,
TP_PROTO(struct super_block *sb, __u64 ino),
TP_ARGS(sb, ino),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
),
TP_printk(SCSBF" ino %llu", SCSB_TRACE_ARGS,
__entry->ino)
);
/*
 * Tracepoint that records the super block trace fields, a 64bit inode
 * number, and an int return code, printed as "ino %llu ret %d" after the
 * SCSBF prefix.
 */
TRACE_EVENT(scoutfs_orphan_scan_end,
TP_PROTO(struct super_block *sb, __u64 ino, int ret),
TP_ARGS(sb, ino, ret),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
__field(int, ret)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->ret = ret;
),
TP_printk(SCSBF" ino %llu ret %d", SCSB_TRACE_ARGS,
__entry->ino, __entry->ret)
);
DECLARE_EVENT_CLASS(scoutfs_lock_info_class,
TP_PROTO(struct super_block *sb, struct lock_info *linfo),
@@ -1110,82 +1036,6 @@ TRACE_EVENT(scoutfs_orphan_inode,
MINOR(__entry->dev), __entry->ino)
);
/*
 * Event class shared by the try_delete tracepoints below.  Every event
 * in the class records the super block trace fields and a 64bit inode
 * number and prints "ino %llu" after the SCSBF prefix.
 */
DECLARE_EVENT_CLASS(scoutfs_try_delete_class,
TP_PROTO(struct super_block *sb, u64 ino),
TP_ARGS(sb, ino),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
),
TP_printk(SCSBF" ino %llu", SCSB_TRACE_ARGS, __entry->ino)
);
/*
 * The four events that use the class.  They're only distinguished by
 * their event name, their arguments and output format are identical.
 */
DEFINE_EVENT(scoutfs_try_delete_class, scoutfs_try_delete,
TP_PROTO(struct super_block *sb, u64 ino),
TP_ARGS(sb, ino)
);
DEFINE_EVENT(scoutfs_try_delete_class, scoutfs_try_delete_local_busy,
TP_PROTO(struct super_block *sb, u64 ino),
TP_ARGS(sb, ino)
);
DEFINE_EVENT(scoutfs_try_delete_class, scoutfs_try_delete_cached,
TP_PROTO(struct super_block *sb, u64 ino),
TP_ARGS(sb, ino)
);
DEFINE_EVENT(scoutfs_try_delete_class, scoutfs_try_delete_no_item,
TP_PROTO(struct super_block *sb, u64 ino),
TP_ARGS(sb, ino)
);
/*
 * Tracepoint that records the super block trace fields, a 64bit inode
 * number, and an unsigned link count.  It has the same prefix and ino
 * field as the try_delete class but adds "nlink %u", so it's declared as
 * its own event.
 */
TRACE_EVENT(scoutfs_try_delete_has_links,
TP_PROTO(struct super_block *sb, u64 ino, unsigned int nlink),
TP_ARGS(sb, ino, nlink),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
__field(unsigned int, nlink)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->nlink = nlink;
),
TP_printk(SCSBF" ino %llu nlink %u", SCSB_TRACE_ARGS, __entry->ino,
__entry->nlink)
);
/*
 * Tracepoint that records the super block trace fields, a 64bit inode
 * number, and an int return code, printed as "ino %llu ret %d" after the
 * SCSBF prefix.
 */
TRACE_EVENT(scoutfs_inode_orphan_delete,
TP_PROTO(struct super_block *sb, u64 ino, int ret),
TP_ARGS(sb, ino, ret),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(__u64, ino)
__field(int, ret)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->ino = ino;
__entry->ret = ret;
),
TP_printk(SCSBF" ino %llu ret %d", SCSB_TRACE_ARGS, __entry->ino,
__entry->ret)
);
TRACE_EVENT(scoutfs_delete_inode,
TP_PROTO(struct super_block *sb, u64 ino, umode_t mode, u64 size),
@@ -1210,32 +1060,6 @@ TRACE_EVENT(scoutfs_delete_inode,
__entry->mode, __entry->size)
);
/*
 * Tracepoint that records the device from sb->s_dev, the inode number,
 * mode, size, and an int return code.  This event doesn't use the SCSB
 * trace field macros; it identifies the mount by printing the device's
 * major and minor numbers.  The mode is printed in hex.
 */
TRACE_EVENT(scoutfs_delete_inode_end,
TP_PROTO(struct super_block *sb, u64 ino, umode_t mode, u64 size, int ret),
TP_ARGS(sb, ino, mode, size, ret),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(__u64, ino)
__field(umode_t, mode)
__field(__u64, size)
__field(int, ret)
),
TP_fast_assign(
__entry->dev = sb->s_dev;
__entry->ino = ino;
__entry->mode = mode;
__entry->size = size;
__entry->ret = ret;
),
TP_printk("dev %d,%d ino %llu, mode 0x%x size %llu, ret %d",
MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino,
__entry->mode, __entry->size, __entry->ret)
);
DECLARE_EVENT_CLASS(scoutfs_key_class,
TP_PROTO(struct super_block *sb, struct scoutfs_key *key),
TP_ARGS(sb, key),
@@ -1619,6 +1443,28 @@ DEFINE_EVENT(scoutfs_work_class, scoutfs_data_return_server_extents_exit,
TP_ARGS(sb, data, ret)
);
/*
 * Event class that records the raw super block pointer, an unsigned long
 * nr_to_scan, and an int return code.  The super block is printed with
 * %p rather than through the SCSB trace field macros.
 */
DECLARE_EVENT_CLASS(scoutfs_shrink_exit_class,
TP_PROTO(struct super_block *sb, unsigned long nr_to_scan, int ret),
TP_ARGS(sb, nr_to_scan, ret),
TP_STRUCT__entry(
__field(void *, sb)
__field(unsigned long, nr_to_scan)
__field(int, ret)
),
TP_fast_assign(
__entry->sb = sb;
__entry->nr_to_scan = nr_to_scan;
__entry->ret = ret;
),
TP_printk("sb %p nr_to_scan %lu ret %d",
__entry->sb, __entry->nr_to_scan, __entry->ret)
);
/* the only event visible here that uses the class */
DEFINE_EVENT(scoutfs_shrink_exit_class, scoutfs_lock_shrink_exit,
TP_PROTO(struct super_block *sb, unsigned long nr_to_scan, int ret),
TP_ARGS(sb, nr_to_scan, ret)
);
TRACE_EVENT(scoutfs_rename,
TP_PROTO(struct super_block *sb, struct inode *old_dir,
struct dentry *old_dentry, struct inode *new_dir,
@@ -3251,24 +3097,6 @@ TRACE_EVENT(scoutfs_ioc_search_xattrs,
__entry->ino, __entry->last_ino)
);
/*
 * Tracepoint that records the super block trace fields and a name
 * string, printed with %s after the SCSBF prefix.
 *
 * NOTE(review): only the @name pointer is stored in the entry, the
 * string isn't copied.  The %s in TP_printk dereferences it when the
 * event is formatted, so callers need to pass strings that remain valid
 * for as long as the trace buffer can be read -- confirm all callers
 * pass static strings.
 */
TRACE_EVENT(scoutfs_trigger_fired,
TP_PROTO(struct super_block *sb, const char *name),
TP_ARGS(sb, name),
TP_STRUCT__entry(
SCSB_TRACE_FIELDS
__field(const char *, name)
),
TP_fast_assign(
SCSB_TRACE_ASSIGN(sb);
__entry->name = name;
),
TP_printk(SCSBF" %s", SCSB_TRACE_ARGS, __entry->name)
);
#endif /* _TRACE_SCOUTFS_H */
/* This part must be outside protection */

View File

@@ -41,7 +41,6 @@
#include "recov.h"
#include "omap.h"
#include "fence.h"
#include "triggers.h"
/*
* Every active mount can act as the server that listens on a net
@@ -1292,13 +1291,9 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
* meta was low so that deleted items are merged
* promptly and freed blocks can bring the client out of
* enospc.
*
* The trigger can be used to force a log merge in cases where
* a test only generates small amounts of change.
*/
finalize_ours = (lt->item_root.height > 2) ||
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW) ||
scoutfs_trigger(sb, LOG_MERGE_FORCE_FINALIZE_OURS);
(le32_to_cpu(lt->meta_avail.flags) & SCOUTFS_ALLOC_FLAG_LOW);
trace_scoutfs_server_finalize_decision(sb, rid, saw_finalized, others_active,
ours_visible, finalize_ours, delay_ms,
@@ -1407,8 +1402,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
BUG_ON(err); /* inconsistent */
}
scoutfs_inc_counter(sb, log_merge_start);
/* we're done, caller can make forward progress */
break;
}
@@ -1625,8 +1618,7 @@ static int server_get_log_trees(struct super_block *sb,
goto update;
}
ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed,
COMMIT_HOLD_ALLOC_BUDGET / 2);
ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100);
if (ret == -EINPROGRESS)
ret = 0;
if (ret < 0) {
@@ -1921,11 +1913,9 @@ static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
scoutfs_alloc_splice_list(sb, &server->alloc, &server->wri, server->other_freed,
&lt.meta_avail)) ?:
(err_str = "empty data_avail",
alloc_move_empty(sb, &super->data_alloc, &lt.data_avail,
COMMIT_HOLD_ALLOC_BUDGET / 2)) ?:
alloc_move_empty(sb, &super->data_alloc, &lt.data_avail, 100)) ?:
(err_str = "empty data_freed",
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed,
COMMIT_HOLD_ALLOC_BUDGET / 2));
alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100));
mutex_unlock(&server->alloc_mutex);
/* only finalize, allowing merging, once the allocators are fully freed */
@@ -2516,8 +2506,6 @@ static int splice_log_merge_completions(struct super_block *sb,
queue_work(server->wq, &server->log_merge_free_work);
else
err_str = "deleting merge status item";
scoutfs_inc_counter(sb, log_merge_complete);
out:
if (upd_stat) {
init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
@@ -3048,13 +3036,7 @@ static int server_commit_log_merge(struct super_block *sb,
SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0,
&stat, sizeof(stat));
if (ret < 0) {
/*
* During a retransmission, it's possible that the server
* already committed and resolved this log merge. ENOENT
* is expected in that case.
*/
if (ret != -ENOENT)
err_str = "getting merge status item";
err_str = "getting merge status item";
goto out;
}

View File

@@ -18,7 +18,6 @@
#include "super.h"
#include "triggers.h"
#include "scoutfs_trace.h"
/*
* We have debugfs files we can write to which arm triggers which
@@ -40,7 +39,6 @@ struct scoutfs_triggers {
static char *names[] = {
[SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE] = "block_remove_stale",
[SCOUTFS_TRIGGER_LOG_MERGE_FORCE_FINALIZE_OURS] = "log_merge_force_finalize_ours",
[SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE] = "srch_compact_logs_pad_safe",
[SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE] = "srch_force_log_rotate",
[SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE] = "srch_merge_stop_safe",
@@ -53,7 +51,6 @@ bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)
atomic_t *atom;
int old;
int mem;
bool fired;
BUG_ON(t >= SCOUTFS_TRIGGER_NR);
atom = &triggers->atomics[t];
@@ -67,12 +64,7 @@ bool scoutfs_trigger_test_and_clear(struct super_block *sb, unsigned int t)
mem = atomic_cmpxchg(atom, old, 0);
} while (mem && mem != old);
fired = !!mem;
if (fired)
trace_scoutfs_trigger_fired(sb, names[t]);
return fired;
return !!mem;
}
int scoutfs_setup_triggers(struct super_block *sb)

View File

@@ -3,7 +3,6 @@
enum scoutfs_trigger {
SCOUTFS_TRIGGER_BLOCK_REMOVE_STALE,
SCOUTFS_TRIGGER_LOG_MERGE_FORCE_FINALIZE_OURS,
SCOUTFS_TRIGGER_SRCH_COMPACT_LOGS_PAD_SAFE,
SCOUTFS_TRIGGER_SRCH_FORCE_LOG_ROTATE,
SCOUTFS_TRIGGER_SRCH_MERGE_STOP_SAFE,

View File

@@ -9,7 +9,7 @@
echo "$0 running rid '$SCOUTFS_FENCED_REQ_RID' ip '$SCOUTFS_FENCED_REQ_IP' args '$@'"
echo_fail() {
echo "$@" >&2
echo "$@" >> /dev/stderr
exit 1
}
@@ -27,7 +27,8 @@ for fs in /sys/fs/scoutfs/*; do
nr="$(quiet_cat $fs/data_device_maj_min)"
[ ! -d "$fs" -o "$fs_rid" != "$rid" ] && continue
mnt=$(findmnt -l -n -t scoutfs -o TARGET -S $nr)
mnt=$(findmnt -l -n -t scoutfs -o TARGET -S $nr) || \
echo_fail "findmnt -t scoutfs -S $nr failed"
[ -z "$mnt" ] && continue
if ! umount -qf "$mnt"; then

View File

@@ -170,9 +170,6 @@ t_filter_dmesg()
# some ci test guests are unresponsive
re="$re|longest quorum heartbeat .* delay"
# creating block devices may trigger this
re="$re|block device autoloading is deprecated and will be removed."
egrep -v "($re)" | \
ignore_harmless_unwind_kasan_stack_oob
}

View File

@@ -498,121 +498,3 @@ t_restore_all_sysfs_mount_options() {
t_set_sysfs_mount_option $i $name "${_saved_opts[$ind]}"
done
}
#
# Arm the log_merge_force_finalize_ours trigger on the server mount and
# wait for a log merge to start and then complete.
#
# Each attempt samples the server's log_merge_start and
# log_merge_complete counters, arms the trigger, calls t_sync_seq_index,
# and polls until the trigger no longer reads back as "1".  If the start
# counter advanced then we poll until the complete counter moves off of
# its sampled value and return.  Otherwise the whole attempt is retried.
#
# There is no timeout, this loops for as long as no merge is started.
#
t_force_log_merge() {
local sv=$(t_server_nr)
local merges_started
local last_merges_started
local merges_completed
local last_merges_completed
while true; do
# sample the counters before arming so that we can see them advance
last_merges_started=$(t_counter log_merge_start $sv)
last_merges_completed=$(t_counter log_merge_complete $sv)
t_trigger_arm_silent log_merge_force_finalize_ours $sv
t_sync_seq_index
# wait for the armed trigger to be cleared
while test "$(t_trigger_get log_merge_force_finalize_ours $sv)" == "1"; do
sleep .5
done
merges_started=$(t_counter log_merge_start $sv)
if (( merges_started > last_merges_started )); then
# a merge started, wait for a completion to be counted
merges_completed=$(t_counter log_merge_complete $sv)
while (( merges_completed == last_merges_completed )); do
sleep .5
merges_completed=$(t_counter log_merge_complete $sv)
done
break
fi
done
}
# most recently sampled orphan_scan counter value, indexed by mount nr
declare -A _last_scan
#
# Record the current orphan_scan counter of every mount in _last_scan.
#
t_get_orphan_scan_runs() {
local i
for i in $(t_fs_nrs); do
_last_scan[$i]=$(t_counter orphan_scan $i)
done
}
#
# Sample every mount's orphan_scan counter and then wait, one mount at
# a time, until each mount's counter differs from its sample.  The
# counters are polled every half second and there is no timeout.
#
t_wait_for_orphan_scan_runs() {
	local i
	local runs

	t_get_orphan_scan_runs

	for i in $(t_fs_nrs); do
		while true; do
			runs=$(t_counter orphan_scan $i)
			# any change from the sample means a scan ran
			(( runs == _last_scan[$i] )) || break
			sleep .5
		done
	done
}
# most recently sampled orphan_scan_empty counter value, indexed by mount nr
declare -A _last_empty
#
# Record the current orphan_scan_empty counter of every mount in
# _last_empty.
#
t_get_orphan_scan_empty() {
local i
for i in $(t_fs_nrs); do
_last_empty[$i]=$(t_counter orphan_scan_empty $i)
done
}
#
# Wait until every mount's orphan_scan_empty counter advances within
# the same round of scans.
#
# Each round waits for all mounts to run an orphan scan and then
# compares each mount's orphan_scan_empty counter with its last sample.
# A mount whose counter didn't change is counted as still working; a
# mount whose counter did change has its sample updated.  We return
# once a round finds no mounts still working.  There is no timeout.
#
t_wait_for_no_orphans() {
local i;
local working;
local empty;
t_get_orphan_scan_empty
while true; do
working=0
t_wait_for_orphan_scan_runs
for i in $(t_fs_nrs); do
empty=$(t_counter orphan_scan_empty $i)
if (( empty == _last_empty[$i] )); then
# no new empty scan was counted for this mount
(( working++ ))
else
(( _last_empty[$i] = empty ))
fi
done
if (( working == 0 )); then
break
fi
sleep 1
done
}
#
# Repeatedly run the arguments as a command, sleeping one second in
# between, until it returns success.  The first argument is a relative
# timeout in seconds.  The remaining arguments are the command and its
# arguments.
#
# If the timeout expires without the command returning 0 then the test
# fails.  The expiry is checked before each attempt, so a timeout of 0
# fails without running the command.
#
t_wait_until_timeout() {
	local relative="$1"
	local expire="$((SECONDS + relative))"

	shift

	while (( SECONDS < expire )); do
		"$@" && return
		sleep 1
	done

	# "$*" joins the command into the one message word.  "$@" inside a
	# larger string would split the message across multiple arguments.
	t_fail "command failed for $relative sec: $*"
}

View File

@@ -43,14 +43,9 @@ t_tap_progress()
local testname=$1
local result=$2
local stmsg=""
local diff=""
local dmsg=""
if [[ -s $T_RESULTS/tmp/${testname}/status.msg ]]; then
stmsg="1"
fi
if [[ -s "$T_RESULTS/tmp/${testname}/dmesg.new" ]]; then
dmsg="1"
fi
@@ -66,7 +61,6 @@ t_tap_progress()
echo "# ${testname} ** skipped - permitted **"
else
echo "not ok ${i} - ${testname}"
case ${result} in
101)
echo "# ${testname} ** skipped **"
@@ -76,13 +70,6 @@ t_tap_progress()
;;
esac
if [[ -n "${stmsg}" ]]; then
echo "#"
echo "# status:"
echo "#"
cat $T_RESULTS/tmp/${testname}/status.msg | sed 's/^/# - /'
fi
if [[ -n "${diff}" ]]; then
echo "#"
echo "# diff:"

View File

@@ -17,7 +17,7 @@ ino not found in dseq index
mount 0 contents after mount 1 rm: contents
ino found in dseq index
ino found in dseq index
stat: cannot stat '/mnt/test/test/inode-deletion/badfile': No such file or directory
stat: cannot stat '/mnt/test/test/inode-deletion/file': No such file or directory
ino not found in dseq index
ino not found in dseq index
== lots of deletions use one open map

View File

@@ -400,8 +400,7 @@ if [ -n "$T_INSMOD" ]; then
fi
if [ -n "$T_TRACE_MULT" ]; then
# orig_trace_size=$(cat /sys/kernel/debug/tracing/buffer_size_kb)
orig_trace_size=1408
orig_trace_size=$(cat /sys/kernel/debug/tracing/buffer_size_kb)
mult_trace_size=$((orig_trace_size * T_TRACE_MULT))
msg "increasing trace buffer size from $orig_trace_size KiB to $mult_trace_size KiB"
echo $mult_trace_size > /sys/kernel/debug/tracing/buffer_size_kb
@@ -505,10 +504,7 @@ crash_monitor()
fi
if [ "$bad" != 0 ]; then
echo "run-tests monitor syncing and triggering crash"
# hail mary, the sync could well hang
(echo s > /proc/sysrq-trigger) &
sleep 5
echo "run-tests monitor triggering crash"
echo c > /proc/sysrq-trigger
exit 1
fi

View File

@@ -5,9 +5,6 @@
t_require_commands sleep touch grep sync scoutfs
t_require_mounts 2
# regularly see ~20/~30s
VERIFY_TIMEOUT_SECS=90
#
# Make sure that all mounts can read the results of a write from each
# mount.
@@ -43,10 +40,8 @@ verify_fenced_run()
for rid in $rids; do
grep -q ".* running rid '$rid'.* args 'ignored run args'" "$T_FENCED_LOG" || \
return 1
t_fail "fenced didn't execute RUN script for rid $rid"
done
return 0
}
echo "== make sure all mounts can see each other"
@@ -59,7 +54,14 @@ rid=$(t_mount_rid $cl)
echo "cl $cl sv $sv rid $rid" >> "$T_TMP.log"
sync
t_force_umount $cl
t_wait_until_timeout $VERIFY_TIMEOUT_SECS verify_fenced_run $rid
# wait for client reconnection to timeout
while grep -q $rid $(t_debugfs_path $sv)/connections; do
sleep .5
done
while t_rid_is_fencing $rid; do
sleep .5
done
verify_fenced_run $rid
t_mount $cl
check_read_write
@@ -81,7 +83,15 @@ for cl in $(t_fs_nrs); do
t_force_umount $cl
done
t_wait_until_timeout $VERIFY_TIMEOUT_SECS verify_fenced_run $rids
# wait for all client reconnections to timeout
while egrep -q "($pattern)" $(t_debugfs_path $sv)/connections; do
sleep .5
done
# wait for all fence requests to complete
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
sleep .5
done
verify_fenced_run $rids
# remount all the clients
for cl in $(t_fs_nrs); do
if [ $cl == $sv ]; then
@@ -97,7 +107,12 @@ rid=$(t_mount_rid $sv)
echo "sv $sv rid $rid" >> "$T_TMP.log"
sync
t_force_umount $sv
t_wait_until_timeout $VERIFY_TIMEOUT_SECS verify_fenced_run $rid
t_wait_for_leader
# wait until new server is done fencing unmounted leader rid
while t_rid_is_fencing $rid; do
sleep .5
done
verify_fenced_run $rid
t_mount $sv
check_read_write
@@ -112,7 +127,11 @@ for nr in $(t_fs_nrs); do
t_force_umount $nr
done
t_mount_all
t_wait_until_timeout $VERIFY_TIMEOUT_SECS verify_fenced_run $rids
# wait for all fence requests to complete
while test -d $(echo /sys/fs/scoutfs/*/fence/* | cut -d " " -f 1); do
sleep .5
done
verify_fenced_run $rids
check_read_write
t_pass

View File

@@ -72,7 +72,7 @@ touch $T_D0/dir/file
mkdir $T_D0/dir/dir
ln -s $T_D0/dir/file $T_D0/dir/symlink
mknod $T_D0/dir/char c 1 3 # null
mknod $T_D0/dir/block b 42 0 # SAMPLE block dev - nonexistant/demo use only number
mknod $T_D0/dir/block b 7 0 # loop0
for name in $(ls -UA $T_D0/dir | sort); do
ino=$(stat -c '%i' $T_D0/dir/$name)
$GRE $ino | filter_types

View File

@@ -61,28 +61,18 @@ rm -f "$T_D1/file"
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"
# Hurry along the orphan scanners. If any are currently asleep, we will
# have to wait at least their current scan interval before they wake up,
# run, and notice their new interval.
t_save_all_sysfs_mount_options orphan_scan_delay_ms
t_set_all_sysfs_mount_options orphan_scan_delay_ms 500
t_wait_for_orphan_scan_runs
echo "== unlink wait for open on other mount"
echo "contents" > "$T_D0/badfile"
ino=$(stat -c "%i" "$T_D0/badfile")
dseq=$(scoutfs stat -s data_seq "$T_D0/badfile")
exec {FD}<"$T_D0/badfile"
rm -f "$T_D1/badfile"
echo "contents" > "$T_D0/file"
ino=$(stat -c "%i" "$T_D0/file")
dseq=$(scoutfs stat -s data_seq "$T_D0/file")
exec {FD}<"$T_D0/file"
rm -f "$T_D1/file"
echo "mount 0 contents after mount 1 rm: $(cat <&$FD)"
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"
exec {FD}>&- # close
# we know that revalidating will unhash the remote dentry
stat "$T_D0/badfile" 2>&1 | sed 's/cannot statx/cannot stat/' | t_filter_fs
t_force_log_merge
# wait for orphan scanners to pick up the unlinked inode and become idle
t_wait_for_no_orphans
stat "$T_D0/file" 2>&1 | sed 's/cannot statx/cannot stat/' | t_filter_fs
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"
@@ -93,20 +83,16 @@ rm -f "$T_D0/dir"/files-*
rmdir "$T_D0/dir"
echo "== open files survive remote scanning orphans"
echo "contents" > "$T_D0/lastfile"
ino=$(stat -c "%i" "$T_D0/lastfile")
dseq=$(scoutfs stat -s data_seq "$T_D0/lastfile")
exec {FD}<"$T_D0/lastfile"
rm -f "$T_D0/lastfile"
echo "contents" > "$T_D0/file"
ino=$(stat -c "%i" "$T_D0/file")
dseq=$(scoutfs stat -s data_seq "$T_D0/file")
exec {FD}<"$T_D0/file"
rm -f "$T_D0/file"
t_umount 1
t_mount 1
echo "mount 0 contents after mount 1 remounted: $(cat <&$FD)"
exec {FD}>&- # close
t_force_log_merge
t_wait_for_no_orphans
check_ino_index "$ino" "$dseq" "$T_M0"
check_ino_index "$ino" "$dseq" "$T_M1"
t_restore_all_sysfs_mount_options orphan_scan_delay_ms
t_pass

View File

@@ -62,7 +62,7 @@ test_timeout()
sleep 1
# tear down the current server/leader
t_force_umount $sv &
t_force_umount $sv
# see how long it takes for the next leader to start
start=$(time_ms)
@@ -73,7 +73,6 @@ test_timeout()
echo "to $to delay $delay" >> $T_TMP.delay
# restore the mount that we tore down
wait
t_mount $sv
# make sure the new leader delay was reasonable, allowing for some slack

View File

@@ -8,19 +8,19 @@ t_require_mounts 2
echo "=== renameat2 noreplace flag test"
# give each mount their own dir (lock group) to minimize create contention
mkdir $T_D0/dir0
mkdir $T_D1/dir1
mkdir $T_M0/dir0
mkdir $T_M1/dir1
echo "=== run two asynchronous calls to renameat2 NOREPLACE"
for i in $(seq 0 100); do
# prepare inputs in isolation
touch "$T_D0/dir0/old0"
touch "$T_D1/dir1/old1"
touch "$T_M0/dir0/old0"
touch "$T_M1/dir1/old1"
# race doing noreplace renames, both can't succeed
dumb_renameat2 -n "$T_D0/dir0/old0" "$T_D0/dir0/sharednew" 2> /dev/null &
dumb_renameat2 -n "$T_M0/dir0/old0" "$T_M0/dir0/sharednew" 2> /dev/null &
pid0=$!
dumb_renameat2 -n "$T_D1/dir1/old1" "$T_D1/dir0/sharednew" 2> /dev/null &
dumb_renameat2 -n "$T_M1/dir1/old1" "$T_M1/dir0/sharednew" 2> /dev/null &
pid1=$!
wait $pid0
@@ -31,7 +31,7 @@ for i in $(seq 0 100); do
test "$rc0" == 0 -a "$rc1" == 0 && t_fail "both renames succeeded"
# blow away possible files for either race outcome
rm -f "$T_D0/dir0/old0" "$T_D1/dir1/old1" "$T_D0/dir0/sharednew" "$T_D1/dir1/sharednew"
rm -f "$T_M0/dir0/old0" "$T_M1/dir1/old1" "$T_M0/dir0/sharednew" "$T_M1/dir1/sharednew"
done
t_pass

View File

@@ -7,7 +7,7 @@ message_output()
error_message()
{
message_output "$@" >&2
message_output "$@" >> /dev/stderr
}
error_exit()

View File

@@ -63,22 +63,6 @@ mounts because there are more locks that cover the same number of
created files. This can be helpful when working with smaller numbers of
large files.
.TP
.B lock_idle_count=<number>
This option sets the number of locks that the client will allow to
remain idle after being granted. If the number of locks exceeds this
count then the client will try to free the oldest locks. This setting
is per-mount and only changes the behavior of that mount.
.sp
Idle locks are not reclaimed by memory pressure so this option
determines the limit of how much memory is likely to be pinned by
allocated idle locks. Setting this too low can increase latency of
operations as repeated use of a working set of locks has to request the
locks from the network rather than using granted idle locks.
.sp
The count is not strictly enforced. Operations are allowed to use locks
while over the limit to avoid deadlocks under heavy concurrent load.
Exceeding the count only attempts freeing of idle locks.
.TP
.B log_merge_wait_timeout_ms=<number>
This option sets the amount of time, in milliseconds, that log merge
creation can wait before timing out. This setting is per-mount, only

View File

@@ -402,25 +402,45 @@ before destroying an old empty data device.
.PD
.TP
.BI "print {-S|--skip-likely-huge} META-DEVICE"
.BI "print {-a|--allocs} {-i|--items ITEMS} {-r|--roots ROOTS} {-S|--skip-likely-huge} {-V|--xattr-values} META-DEVICE"
.sp
Prints out all of the metadata in the file system. This makes no effort
Prints out some or all of the metadata in the file system. This makes no effort
to ensure that the structures are consistent as they're traversed and
can present structures that seem corrupt as they change as they're
output.
.sp
Structures that are related to the number of mounts and are maintained at a
relatively reasonable size are always printed. These include per-mount log
trees, srch files, allocators, and the metadata allocators used by server
commits. Other btrees and their items can be selected as desired.
.RS 1.0i
.PD 0
.TP
.sp
.TP
.B "-a, --allocs"
Print the metadata and data allocators. Enabled by default.
.TP
.B "-r, --roots ROOTS"
This option can be used to select which btrees are traversed. It is a comma-separated list containing one or more of the following btree roots: logs, srch, fs. Default is all roots.
.TP
.B "-i, --items ITEMS"
This option can be used to choose which btree items are printed from the
selected btree roots. It is a comma-separated list containing one or
more of the following items: inode, xattr, dirent, symlink, backref, extent,
totl, indx, inoindex, orphan, quota.
Default is all items.
.TP
.B "-V, --xattr-values"
Print xattr values alongside the xattr item. Non-printable bytes are
rendered as '.'. A trailing '...' indicates the value continues in
additional item parts that aren't shown.
.TP
.B "-S, --skip-likely-huge"
Skip printing structures that are likely to be very large. The
structures that are skipped tend to be global and whose size tends to be
related to the size of the volume. Examples of skipped structures include
the global fs items, srch files, and metadata and data
allocators. Similar structures that are not skipped are related to the
number of mounts and are maintained at a relatively reasonable size.
These include per-mount log trees, srch files, allocators, and the
metadata allocators used by server commits.
allocators.
.sp
Skipping the larger structures limits the print output to a relatively
constant size rather than being a large multiple of the used metadata

View File

@@ -29,6 +29,54 @@
#include "leaf_item_hash.h"
#include "dev.h"
/*
 * Options that control what the print command outputs.  The walk_*
 * fields select which structures print_volume() traverses, the print_*
 * fields select which item types find_printer() lets through, and
 * print_xattr_values makes print_xattr() include the value bytes.
 */
struct print_args {
char *meta_device;
bool skip_likely_huge;
/* NOTE(review): presumably set when the matching option is given on the command line -- confirm in the option parser */
bool roots_requested;
bool items_requested;
bool allocs_requested;
/* structures to traverse */
bool walk_allocs;
bool walk_logs_root;
bool walk_fs_root;
bool walk_srch_root;
/* item types to print from the traversed btrees */
bool print_inodes;
bool print_xattrs;
bool print_dirents;
bool print_symlinks;
bool print_backrefs;
bool print_extents;
bool print_totl;
bool print_indx;
bool print_inode_index;
bool print_orphan;
bool print_quota;
/* also print the xattr value bytes found in the first xattr item part */
bool print_xattr_values;
};
/*
 * The single global set of print options.  The defaults walk every
 * structure and print every item type; only skip_likely_huge, the
 * *_requested flags, and xattr value printing start out disabled.
 */
static struct print_args print_args = {
.meta_device = NULL,
.skip_likely_huge = false,
.roots_requested = false,
.items_requested = false,
.allocs_requested = false,
.walk_allocs = true,
.walk_logs_root = true,
.walk_fs_root = true,
.walk_srch_root = true,
.print_inodes = true,
.print_xattrs = true,
.print_dirents = true,
.print_symlinks = true,
.print_backrefs = true,
.print_extents = true,
.print_totl = true,
.print_indx = true,
.print_inode_index = true,
.print_orphan = true,
.print_quota = true,
.print_xattr_values = false
};
static void print_block_header(struct scoutfs_block_header *hdr, int size)
{
u32 crc = crc_block(hdr, size);
@@ -135,15 +183,42 @@ static u8 *global_printable_name(u8 *name, int name_len)
static void print_xattr(struct scoutfs_key *key, void *val, int val_len)
{
struct scoutfs_xattr *xat = val;
unsigned int full_val_len;
int avail;
int show;
int i;
printf(" xattr: ino %llu name_hash %08x id %llu part %u\n",
le64_to_cpu(key->skx_ino), (u32)le64_to_cpu(key->skx_name_hash),
le64_to_cpu(key->skx_id), key->skx_part);
if (key->skx_part == 0)
printf(" name_len %u val_len %u name %s\n",
xat->name_len, le16_to_cpu(xat->val_len),
global_printable_name(xat->name, xat->name_len));
if (key->skx_part != 0)
return;
full_val_len = le16_to_cpu(xat->val_len);
printf(" name_len %u val_len %u name %s",
xat->name_len, full_val_len,
global_printable_name(xat->name, xat->name_len));
if (!print_args.print_xattr_values) {
putchar('\n');
return;
}
avail = val_len - (int)sizeof(*xat) - xat->name_len;
if (avail < 0)
avail = 0;
show = avail < (int)full_val_len ? avail : (int)full_val_len;
printf(" value ");
for (i = 0; i < show; i++) {
u8 c = xat->name[xat->name_len + i];
putchar(isprint(c) ? c : '.');
}
if (show < (int)full_val_len)
printf("...");
putchar('\n');
}
static void print_dirent(struct scoutfs_key *key, void *val, int val_len)
@@ -195,36 +270,72 @@ static void print_inode_index(struct scoutfs_key *key, void *val, int val_len)
typedef void (*print_func_t)(struct scoutfs_key *key, void *val, int val_len);
static print_func_t find_printer(u8 zone, u8 type)
static print_func_t find_printer(u8 zone, u8 type, bool *suppress)
{
if (zone == SCOUTFS_INODE_INDEX_ZONE &&
type >= SCOUTFS_INODE_INDEX_META_SEQ_TYPE &&
type <= SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE)
type <= SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE) {
if (!print_args.print_inode_index)
*suppress = true;
return print_inode_index;
if (zone == SCOUTFS_ORPHAN_ZONE) {
if (type == SCOUTFS_ORPHAN_TYPE)
return print_orphan;
}
if (zone == SCOUTFS_QUOTA_ZONE)
if (zone == SCOUTFS_ORPHAN_ZONE) {
if (type == SCOUTFS_ORPHAN_TYPE) {
if (!print_args.print_orphan)
*suppress = true;
return print_orphan;
}
}
if (zone == SCOUTFS_QUOTA_ZONE) {
if (!print_args.print_quota)
*suppress = true;
return print_quota;
}
if (zone == SCOUTFS_XATTR_TOTL_ZONE)
if (zone == SCOUTFS_XATTR_TOTL_ZONE) {
if (!print_args.print_totl)
*suppress = true;
return print_xattr_totl;
}
if (zone == SCOUTFS_XATTR_INDX_ZONE)
if (zone == SCOUTFS_XATTR_INDX_ZONE) {
if (!print_args.print_indx)
*suppress = true;
return print_xattr_indx;
}
if (zone == SCOUTFS_FS_ZONE) {
switch(type) {
case SCOUTFS_INODE_TYPE: return print_inode;
case SCOUTFS_XATTR_TYPE: return print_xattr;
case SCOUTFS_DIRENT_TYPE: return print_dirent;
case SCOUTFS_READDIR_TYPE: return print_dirent;
case SCOUTFS_SYMLINK_TYPE: return print_symlink;
case SCOUTFS_LINK_BACKREF_TYPE: return print_dirent;
case SCOUTFS_DATA_EXTENT_TYPE: return print_data_extent;
case SCOUTFS_INODE_TYPE:
if (!print_args.print_inodes)
*suppress = true;
return print_inode;
case SCOUTFS_XATTR_TYPE:
if (!print_args.print_xattrs)
*suppress = true;
return print_xattr;
case SCOUTFS_DIRENT_TYPE:
if (!print_args.print_dirents)
*suppress = true;
return print_dirent;
case SCOUTFS_READDIR_TYPE:
if (!print_args.print_dirents)
*suppress = true;
return print_dirent;
case SCOUTFS_SYMLINK_TYPE:
if (!print_args.print_symlinks)
*suppress = true;
return print_symlink;
case SCOUTFS_LINK_BACKREF_TYPE:
if (!print_args.print_backrefs)
*suppress = true;
return print_dirent;
case SCOUTFS_DATA_EXTENT_TYPE:
if (!print_args.print_extents)
*suppress = true;
return print_data_extent;
}
}
@@ -244,12 +355,16 @@ static int print_fs_item(struct scoutfs_key *key, u64 seq, u8 flags, void *val,
/* only items in leaf blocks have values */
if (val != NULL && !(flags & SCOUTFS_ITEM_FLAG_DELETION)) {
printer = find_printer(key->sk_zone, key->sk_type);
if (printer)
printer(key, val, val_len);
else
bool suppress = false;
printer = find_printer(key->sk_zone, key->sk_type, &suppress);
if (printer) {
if (!suppress)
printer(key, val, val_len);
} else {
printf(" (unknown zone %u type %u)\n",
key->sk_zone, key->sk_type);
}
}
return 0;
@@ -1037,12 +1152,7 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
}
}
struct print_args {
char *meta_device;
bool skip_likely_huge;
};
static int print_volume(int fd, struct print_args *args)
static int print_volume(int fd)
{
struct scoutfs_super_block *super = NULL;
struct print_recursion_args pa;
@@ -1092,7 +1202,7 @@ static int print_volume(int fd, struct print_args *args)
ret = err;
}
if (!args->skip_likely_huge) {
if (print_args.walk_allocs) {
for (i = 0; i < array_size(super->meta_alloc); i++) {
snprintf(str, sizeof(str), "meta_alloc[%u]", i);
err = print_btree(fd, super, str, &super->meta_alloc[i].root,
@@ -1119,18 +1229,21 @@ static int print_volume(int fd, struct print_args *args)
pa.super = super;
pa.fd = fd;
if (!args->skip_likely_huge) {
if (print_args.walk_srch_root) {
err = print_btree_leaf_items(fd, super, &super->srch_root.ref,
print_srch_root_files, &pa);
if (err && !ret)
ret = err;
}
err = print_btree_leaf_items(fd, super, &super->logs_root.ref,
print_log_trees_roots, &pa);
if (err && !ret)
ret = err;
if (!args->skip_likely_huge) {
if (print_args.walk_logs_root) {
err = print_btree_leaf_items(fd, super, &super->logs_root.ref,
print_log_trees_roots, &pa);
if (err && !ret)
ret = err;
}
if (print_args.walk_fs_root) {
err = print_btree(fd, super, "fs_root", &super->fs_root,
print_fs_item, NULL);
if (err && !ret)
@@ -1143,16 +1256,16 @@ out:
return ret;
}
static int do_print(struct print_args *args)
static int do_print(void)
{
int ret;
int fd;
fd = open(args->meta_device, O_RDONLY);
fd = open(print_args.meta_device, O_RDONLY);
if (fd < 0) {
ret = -errno;
fprintf(stderr, "failed to open '%s': %s (%d)\n",
args->meta_device, strerror(errno), errno);
print_args.meta_device, strerror(errno), errno);
return ret;
}
@@ -1160,30 +1273,203 @@ static int do_print(struct print_args *args)
if (ret < 0)
goto out;
ret = print_volume(fd, args);
ret = print_volume(fd);
out:
close(fd);
return ret;
};
/*
 * Indices of the --roots sub-option names in root_tokens[].  parse_opt()
 * switches on these after getsubopt() matches a token against that table.
 */
enum {
	LOGS_OPT = 0,
	FS_OPT   = 1,
	SRCH_OPT = 2,
};
/*
 * Names accepted by --roots, passed to getsubopt() in parse_opt().  Each
 * string sits at the index of its *_OPT enumerator so the index returned by
 * getsubopt() can be switched on directly.  The trailing NULL terminates
 * the table for getsubopt().
 */
static char *const root_tokens[] = {
[LOGS_OPT] = "logs",
[FS_OPT] = "fs",
[SRCH_OPT] = "srch",
NULL
};
/*
 * Indices of the --items sub-option names in item_tokens[].  parse_opt()
 * switches on these after getsubopt() matches a token against that table.
 */
enum {
	INODE_OPT    = 0,
	XATTR_OPT    = 1,
	DIRENT_OPT   = 2,
	SYMLINK_OPT  = 3,
	BACKREF_OPT  = 4,
	EXTENT_OPT   = 5,
	TOTL_OPT     = 6,
	INDX_OPT     = 7,
	INOINDEX_OPT = 8,
	ORPHAN_OPT   = 9,
	QUOTA_OPT    = 10,
};
/*
 * Names accepted by --items, passed to getsubopt() in parse_opt().  Each
 * string sits at the index of its *_OPT enumerator so the index returned by
 * getsubopt() can be switched on directly.  The trailing NULL terminates
 * the table for getsubopt().
 */
static char *const item_tokens[] = {
[INODE_OPT] = "inode",
[XATTR_OPT] = "xattr",
[DIRENT_OPT] = "dirent",
[SYMLINK_OPT] = "symlink",
[BACKREF_OPT] = "backref",
[EXTENT_OPT] = "extent",
[TOTL_OPT] = "totl",
[INDX_OPT] = "indx",
[INOINDEX_OPT] = "inoindex",
[ORPHAN_OPT] = "orphan",
[QUOTA_OPT] = "quota",
NULL
};
/*
 * Turn off printing for every item type that --items can select.
 * parse_opt() calls this the first time -i is seen so that only the item
 * types named on the command line end up enabled.
 */
static void clear_items(void)
{
	/* item types that find_printer() dispatches under SCOUTFS_FS_ZONE */
	print_args.print_inodes = false;
	print_args.print_xattrs = false;
	print_args.print_dirents = false;
	print_args.print_symlinks = false;
	print_args.print_backrefs = false;
	print_args.print_extents = false;

	/* item types that live in their own zones */
	print_args.print_totl = false;
	print_args.print_indx = false;
	print_args.print_inode_index = false;
	print_args.print_orphan = false;
	print_args.print_quota = false;
}
/*
 * Turn off walking of every btree root that --roots can select.
 * parse_opt() calls this the first time -r is seen so that only the roots
 * named on the command line end up enabled.
 */
static void clear_roots(void)
{
	print_args.walk_logs_root = false;
	print_args.walk_fs_root = false;
	print_args.walk_srch_root = false;
}
/*
 * argp callback for the print command.  Each call handles one option key,
 * positional argument, or end-of-parse event and records the result in the
 * struct print_args reached through state->input.
 *
 * -i and -r take sub-option lists that are split with getsubopt() against
 * item_tokens[] and root_tokens[].  The first -i (or -r) clears every item
 * (or root) flag so that only the named ones are enabled; repeated uses of
 * the same option accumulate.  Either option also turns off the allocator
 * walk unless -a was given earlier on the command line.
 *
 * NOTE(review): clear_items() and clear_roots() write the file-scope
 * print_args directly while everything else here goes through args; this
 * assumes state->input points at that same object -- confirm against the
 * argp_parse() caller.
 */
static int parse_opt(int key, char *arg, struct argp_state *state)
{
struct print_args *args = state->input;
char *subopts;
char *value;
bool parse_err = false;
switch (key) {
case 'S':
/* only recorded here; translated into walk_* flags at ARGP_KEY_FINI */
args->skip_likely_huge = true;
break;
case 'a':
/* remember the explicit request so -i, -r and -S leave walk_allocs on */
args->allocs_requested = true;
args->walk_allocs = true;
break;
case 'V':
args->print_xattr_values = true;
break;
case 'i':
/* Specific items being requested- clear them all to start */
if (!args->items_requested) {
clear_items();
if (!args->allocs_requested)
args->walk_allocs = false;
args->items_requested = true;
}
/* getsubopt() advances subopts past each token it consumes */
subopts = arg;
while (*subopts != '\0' && !parse_err) {
switch (getsubopt(&subopts, item_tokens, &value)) {
case INODE_OPT:
args->print_inodes = true;
break;
case XATTR_OPT:
args->print_xattrs = true;
break;
case DIRENT_OPT:
args->print_dirents = true;
break;
case SYMLINK_OPT:
args->print_symlinks = true;
break;
case BACKREF_OPT:
args->print_backrefs = true;
break;
case EXTENT_OPT:
args->print_extents = true;
break;
case TOTL_OPT:
args->print_totl = true;
break;
case INDX_OPT:
args->print_indx = true;
break;
case INOINDEX_OPT:
args->print_inode_index = true;
break;
case ORPHAN_OPT:
args->print_orphan = true;
break;
case QUOTA_OPT:
args->print_quota = true;
break;
default:
/* token not in item_tokens[]; parse_err ends the loop in case argp_usage() returns */
argp_usage(state);
parse_err = true;
break;
}
}
break;
case 'r':
/* Specific roots being requested- clear them all to start */
if (!args->roots_requested) {
clear_roots();
if (!args->allocs_requested)
args->walk_allocs = false;
args->roots_requested = true;
}
/* getsubopt() advances subopts past each token it consumes */
subopts = arg;
while (*subopts != '\0' && !parse_err) {
switch (getsubopt(&subopts, root_tokens, &value)) {
case LOGS_OPT:
args->walk_logs_root = true;
break;
case FS_OPT:
args->walk_fs_root = true;
break;
case SRCH_OPT:
args->walk_srch_root = true;
break;
default:
/* token not in root_tokens[]; parse_err ends the loop in case argp_usage() returns */
argp_usage(state);
parse_err = true;
break;
}
}
break;
case ARGP_KEY_ARG:
/* the only positional argument accepted is the metadata device path */
if (!args->meta_device)
args->meta_device = strdup_or_error(state, arg);
else
argp_error(state, "more than one argument given");
break;
case ARGP_KEY_FINI:
if (!args->meta_device)
argp_error(state, "no metadata device argument given");
/*
 * For backwards compatibility, translate -S. Should we warn if
 * this conflicts with other explicit options?
 *
 * As written, -S turns off the fs and srch root walks even when
 * they were named with -r; only an explicit -a survives it.
 */
if (args->skip_likely_huge) {
if (!args->allocs_requested)
args->walk_allocs = false;
args->walk_fs_root = false;
args->walk_srch_root = false;
}
break;
default:
break;
}
@@ -1192,7 +1478,11 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
}
static struct argp_option options[] = {
{ "skip-likely-huge", 'S', NULL, 0, "Skip large structures to minimize output size"},
{ "allocs", 'a', NULL, 0, "Print metadata and data alloc lists" },
{ "items", 'i', "ITEMS", 0, "Item(s) to print (inode, xattr, dirent, symlink, backref, extent, totl, indx, inoindex, orphan, quota)" },
{ "roots", 'r', "ROOTS", 0, "Tree root(s) to walk (logs, srch, fs)" },
{ "skip-likely-huge", 'S', NULL, 0, "Skip allocs, srch root and fs root to minimize output size" },
{ "xattr-values", 'V', NULL, 0, "Print xattr values (non-printable bytes rendered as '.')" },
{ NULL }
};
@@ -1205,17 +1495,15 @@ static struct argp argp = {
static int print_cmd(int argc, char **argv)
{
struct print_args print_args = {NULL};
int ret;
ret = argp_parse(&argp, argc, argv, 0, NULL, &print_args);
if (ret)
return ret;
return do_print(&print_args);
return do_print();
}
static void __attribute__((constructor)) print_ctor(void)
{
cmd_register_argp("print", &argp, GROUP_DEBUG, print_cmd);