mirror of https://github.com/versity/scoutfs.git
synced 2026-04-30 01:46:54 +00:00

Compare commits: auke/make_ ... zab/get_ch (11 commits)

- 0c3085c6e8
- 97f2d1ce8d
- bd14a369e3
- 448dd74663
- 57785066c0
- b23022444a
- 64931c395d
- 380442515e
- 750e998e40
- 747a8bc53d
- 77327ae713
@@ -13,6 +13,7 @@ scoutfs-y += \
 	avl.o \
 	alloc.o \
 	block.o \
+	bsearch_index.o \
 	btree.o \
 	client.o \
 	counters.o \
@@ -36,6 +37,7 @@ scoutfs-y += \
 	per_task.o \
 	quorum.o \
 	quota.o \
+	raw.o \
 	recov.o \
 	scoutfs_trace.o \
 	server.o \
@@ -24,7 +24,6 @@
 #include "trans.h"
 #include "alloc.h"
 #include "counters.h"
-#include "msg.h"
 #include "scoutfs_trace.h"
 
 /*
@@ -497,11 +496,10 @@ static int dirty_alloc_blocks(struct super_block *sb,
 	struct scoutfs_block *fr_bl = NULL;
 	struct scoutfs_block *bl;
 	bool link_orig = false;
-	__le32 orig_first_nr;
 	u64 av_peek;
-	u64 av_old = 0;
+	u64 av_old;
 	u64 fr_peek;
-	u64 fr_old = 0;
+	u64 fr_old;
 	int ret;
 
 	if (alloc->dirty_avail_bl != NULL)
@@ -511,7 +509,6 @@ static int dirty_alloc_blocks(struct super_block *sb,
 
 	/* undo dirty freed if we get an error after */
 	orig_freed = alloc->freed.ref;
-	orig_first_nr = alloc->freed.first_nr;
 
 	if (alloc->dirty_avail_bl != NULL) {
 		ret = 0;
@@ -565,17 +562,6 @@ static int dirty_alloc_blocks(struct super_block *sb,
 	/* sort dirty avail to encourage contiguous sorted meta blocks */
 	list_block_sort(av_bl->data);
 
-	lblk = fr_bl->data;
-	if (WARN_ON_ONCE(alloc->freed.ref.blkno != lblk->hdr.blkno)) {
-		scoutfs_err(sb, "dirty_alloc freed ref %llu hdr %llu av_old %llu fr_old %llu av_peek %llu fr_peek %llu link_orig %d",
-			    le64_to_cpu(alloc->freed.ref.blkno),
-			    le64_to_cpu(lblk->hdr.blkno),
-			    av_old, fr_old, av_peek, fr_peek, link_orig);
-		ret = -EIO;
-		goto out;
-	}
-	lblk = NULL;
-
 	if (av_old)
 		list_block_add(&alloc->freed, fr_bl->data, av_old);
 	if (fr_old)
@@ -592,7 +578,6 @@ out:
 		if (fr_bl)
 			scoutfs_block_writer_forget(sb, wri, fr_bl);
 		alloc->freed.ref = orig_freed;
-		alloc->freed.first_nr = orig_first_nr;
 	}
 
 	mutex_unlock(&alloc->mutex);
@@ -218,7 +218,6 @@ static void block_free_work(struct work_struct *work)
 
 	llist_for_each_entry_safe(bp, tmp, deleted, free_node) {
 		block_free(sb, bp);
-		cond_resched();
 	}
 }
 
@@ -468,6 +467,9 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 	sector_t sector;
 	int ret = 0;
 
+	if (scoutfs_forcing_unmount(sb))
+		return -ENOLINK;
+
 	sector = bp->bl.blkno << (SCOUTFS_BLOCK_LG_SHIFT - 9);
 
 	WARN_ON_ONCE(bp->bl.blkno == U64_MAX);
@@ -478,17 +480,6 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 	set_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
 	block_get(bp);
 
-	/*
-	 * A second thread may already be waiting on this block's completion
-	 * after this thread won the race to submit the block.  We exit through
-	 * the block_end_io error path which sets BLOCK_BIT_ERROR and assures
-	 * that other callers in the waitq get woken up.
-	 */
-	if (scoutfs_forcing_unmount(sb)) {
-		ret = -ENOLINK;
-		goto end_io;
-	}
-
 	blk_start_plug(&plug);
 
 	for (off = 0; off < SCOUTFS_BLOCK_LG_SIZE; off += PAGE_SIZE) {
@@ -526,7 +517,6 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
 
 	blk_finish_plug(&plug);
 
-end_io:
 	/* let racing end_io know we're done */
 	block_end_io(sb, opf, bp, ret);
 
@@ -846,8 +836,6 @@ int scoutfs_block_dirty_ref(struct super_block *sb, struct scoutfs_alloc *alloc,
 	bp = BLOCK_PRIVATE(bl);
 
 	if (block_is_dirty(bp)) {
-		if (ref_blkno)
-			*ref_blkno = 0;
 		ret = 0;
 		goto out;
 	}
 
@@ -1,6 +1,8 @@
 #ifndef _SCOUTFS_BLOCK_H_
 #define _SCOUTFS_BLOCK_H_
 
+struct scoutfs_alloc;
+
 struct scoutfs_block_writer {
 	spinlock_t lock;
 	struct list_head dirty_list;
kmod/src/bsearch_index.c (new file, 59 lines)
@@ -0,0 +1,59 @@
+/*
+ * Copyright (C) 2026 Versity Software, Inc.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/bsearch.h>
+
+#include "bsearch_index.h"
+
+struct bsearch_index_key {
+	int (*cmp)(const void *key, const void *elt);
+	/* the key has to be const, so we have to update the index through a pointer */
+	void **index_elt;
+	const void *key;
+	size_t size;
+};
+
+static int cmp_index(const void *key, const void *elt)
+{
+	const struct bsearch_index_key *bik = key;
+	int cmp = bik->cmp(bik->key, elt);
+
+	if (cmp > 0)
+		*(bik->index_elt) = (void *)elt + bik->size;
+	else
+		*(bik->index_elt) = (void *)elt;
+
+	return cmp;
+}
+
+/*
+ * A bsearch() wrapper that returns the index of the element of the
+ * array that the key would be stored in to maintain sort order.  It's
+ * the first element where the existing element is greater than the key.
+ * It returns the size of the array if the key is greater than the last
+ * element in the array.
+ */
+size_t bsearch_index(const void *key, const void *base, size_t num, size_t size,
+		     int (*cmp)(const void *key, const void *elt))
+{
+	void *index_elt = (void *)base;
+	struct bsearch_index_key bik = {
+		.cmp = cmp,
+		.index_elt = &index_elt,
+		.key = key,
+		.size = size,
+	};
+
+	bsearch(&bik, base, num, size, cmp_index);
+	return ((unsigned long)index_elt - (unsigned long)base) / size;
+}
kmod/src/bsearch_index.h (new file, 7 lines)
@@ -0,0 +1,7 @@
+#ifndef _SCOUTFS_BSEARCH_INDEX_H_
+#define _SCOUTFS_BSEARCH_INDEX_H_
+
+size_t bsearch_index(const void *key, const void *base, size_t num, size_t size,
+		     int (*cmp)(const void *key, const void *elt));
+
+#endif
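For context, a minimal caller sketch (hypothetical, not part of this patch): bsearch_index() keeps bsearch()'s key-versus-element cmp() contract, so a sorted-array insertion point falls out directly.

/* hypothetical usage sketch: find the slot a value would occupy in a
 * sorted array of u64s; cmp() follows bsearch()'s key/elt contract */
static int cmp_u64(const void *key, const void *elt)
{
	const u64 *k = key;
	const u64 *e = elt;

	return *k < *e ? -1 : *k > *e ? 1 : 0;
}

static size_t u64_insert_index(const u64 *sorted, size_t nr, u64 val)
{
	/* returns 0..nr; nr means val sorts after every element */
	return bsearch_index(&val, sorted, nr, sizeof(sorted[0]), cmp_u64);
}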
@@ -1816,6 +1816,11 @@ int scoutfs_btree_dirty(struct super_block *sb,
 * Call the user's callback on all the items in the leaf that we find.
 * We also set the caller's keys for the first and last possible keys
 * that could exist in the leaf block.
+ *
+ * The callback can set a new key to continue reading from rather than
+ * iterating over all the items.  It modifies the key and returns
+ * -ESRCH, which performs a new avl search.  If the modified key falls
+ * outside of the range of keys in the block then we return.
 */
int scoutfs_btree_read_items(struct super_block *sb,
			     struct scoutfs_btree_root *root,
@@ -1829,6 +1834,7 @@ int scoutfs_btree_read_items(struct super_block *sb,
 	struct scoutfs_avl_node *next_node;
 	struct scoutfs_avl_node *node;
 	struct btree_walk_key_range kr;
+	struct scoutfs_key cb_key;
 	struct scoutfs_block *bl;
 	int ret;
 
@@ -1842,22 +1848,32 @@ int scoutfs_btree_read_items(struct super_block *sb,
 	if (scoutfs_key_compare(&kr.end, end) < 0)
 		*end = kr.end;
 
-	node = scoutfs_avl_search(&bt->item_root, cmp_key_item, start, NULL,
+	cb_key = *start;
+search:
+	node = scoutfs_avl_search(&bt->item_root, cmp_key_item, &cb_key, NULL,
 				  NULL, &next_node, NULL) ?: next_node;
 	while (node) {
 		item = node_item(node);
 		if (scoutfs_key_compare(&item->key, end) > 0)
 			break;
 
-		ret = cb(sb, item_key(item), le64_to_cpu(item->seq), item->flags,
+		cb_key = *item_key(item);
+		ret = cb(sb, &cb_key, le64_to_cpu(item->seq), item->flags,
 			 item_val(bt, item), item_val_len(item), arg);
-		if (ret < 0)
-			break;
+		if (ret < 0) {
+			if (ret == -ESRCH) {
+				if (scoutfs_key_compare(&cb_key, start) >= 0)
+					goto search;
+				ret = 0;
+			}
+			goto out;
+		}
 
 		node = scoutfs_avl_next(&bt->item_root, node);
 	}
 
 	scoutfs_block_put(sb, bl);
 	ret = 0;
+out:
 	return ret;
 }
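To illustrate the new -ESRCH contract, here is a hypothetical callback sketch (the callback typedef and arg plumbing are assumed, not taken from the patch): rewriting the key and returning -ESRCH makes scoutfs_btree_read_items() re-run the avl search from the new key instead of visiting every intervening item.

struct skip_arg {
	struct scoutfs_key resume;	/* first key worth visiting next */
	int seen;
};

/* hypothetical callback: after enough items, seek instead of iterating */
static int skip_ahead_cb(struct super_block *sb, struct scoutfs_key *key,
			 u64 seq, u8 flags, void *val, int val_len, void *arg)
{
	struct skip_arg *sa = arg;

	if (++sa->seen < 16)
		return 0;	/* keep walking item by item */

	*key = sa->resume;	/* read_items re-searches from *key */
	return -ESRCH;
}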
@@ -59,31 +59,6 @@ struct client_info {
 	struct completion farewell_comp;
 };
 
-/*
- * Reconnection to a new server completes pending sync requests with
- * -ECONNRESET because their state in the old server was reclaimed at
- * fence time.  Transparently retry so callers don't surface the
- * reconnect as a failed RPC; preserve the pre-drain behavior where a
- * sync request was silently resent across failover.  Shutdown paths
- * break the loop via the errors that submit and wait already return.
- */
-static int client_sync_request(struct super_block *sb,
-			       struct scoutfs_net_connection *conn,
-			       u8 cmd, void *arg, unsigned arg_len,
-			       void *resp, size_t resp_len)
-{
-	int ret;
-
-	for (;;) {
-		ret = scoutfs_net_sync_request(sb, conn, cmd, arg, arg_len,
-					       resp, resp_len);
-		if (ret != -ECONNRESET)
-			return ret;
-		if (scoutfs_unmounting(sb) || scoutfs_forcing_unmount(sb))
-			return -ESHUTDOWN;
-	}
-}
-
 /*
  * Ask for a new run of allocated inode numbers.  The server can return
  * fewer than @count.  It will succeed with nr == 0 if we've run out.
@@ -97,10 +72,10 @@ int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count,
 	u64 tmp;
 	int ret;
 
-	ret = client_sync_request(sb, client->conn,
-				  SCOUTFS_NET_CMD_ALLOC_INODES,
-				  &lecount, sizeof(lecount),
-				  &ial, sizeof(ial));
+	ret = scoutfs_net_sync_request(sb, client->conn,
+				       SCOUTFS_NET_CMD_ALLOC_INODES,
+				       &lecount, sizeof(lecount),
+				       &ial, sizeof(ial));
 	if (ret == 0) {
 		*ino = le64_to_cpu(ial.ino);
 		*nr = le64_to_cpu(ial.nr);
@@ -119,9 +94,9 @@ int scoutfs_client_get_log_trees(struct super_block *sb,
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn,
-				   SCOUTFS_NET_CMD_GET_LOG_TREES,
-				   NULL, 0, lt, sizeof(*lt));
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_GET_LOG_TREES,
+					NULL, 0, lt, sizeof(*lt));
 }
 
 int scoutfs_client_commit_log_trees(struct super_block *sb,
@@ -129,9 +104,9 @@ int scoutfs_client_commit_log_trees(struct super_block *sb,
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn,
-				   SCOUTFS_NET_CMD_COMMIT_LOG_TREES,
-				   lt, sizeof(*lt), NULL, 0);
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_COMMIT_LOG_TREES,
+					lt, sizeof(*lt), NULL, 0);
 }
 
 int scoutfs_client_get_roots(struct super_block *sb,
@@ -139,26 +114,9 @@ int scoutfs_client_get_roots(struct super_block *sb,
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn,
-				   SCOUTFS_NET_CMD_GET_ROOTS,
-				   NULL, 0, roots, sizeof(*roots));
-}
-
-/*
- * Bounded-wait get_roots for the orphan scan worker.  The worker
- * reschedules on error, so -ETIMEDOUT is treated like any other RPC
- * failure and retries on the next scan.
- */
-int scoutfs_client_get_roots_timeout(struct super_block *sb,
-				     struct scoutfs_net_roots *roots,
-				     unsigned long timeout_jiffies)
-{
-	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-
-	return scoutfs_net_sync_request_timeout(sb, client->conn,
-						SCOUTFS_NET_CMD_GET_ROOTS,
-						NULL, 0, roots, sizeof(*roots),
-						timeout_jiffies);
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_GET_ROOTS,
+					NULL, 0, roots, sizeof(*roots));
 }
 
 int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq)
@@ -167,9 +125,9 @@ int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq)
 	__le64 last_seq;
 	int ret;
 
-	ret = client_sync_request(sb, client->conn,
-				  SCOUTFS_NET_CMD_GET_LAST_SEQ,
-				  NULL, 0, &last_seq, sizeof(last_seq));
+	ret = scoutfs_net_sync_request(sb, client->conn,
+				       SCOUTFS_NET_CMD_GET_LAST_SEQ,
+				       NULL, 0, &last_seq, sizeof(last_seq));
 	if (ret == 0)
 		*seq = le64_to_cpu(last_seq);
 
@@ -182,34 +140,24 @@ static int client_lock_response(struct super_block *sb,
 				void *resp, unsigned int resp_len,
 				int error, void *data)
 {
-	struct scoutfs_lock *lock = data;
-
-	if (error) {
-		scoutfs_lock_request_failed(sb, lock);
-		return 0;
-	}
-
 	if (resp_len != sizeof(struct scoutfs_net_lock))
 		return -EINVAL;
 
+	/* XXX error? */
+
 	return scoutfs_lock_grant_response(sb, resp);
 }
 
-/*
- * Send a lock request to the server.  The lock is anchored by
- * request_pending so its address is stable until the response callback
- * runs and clears request_pending on either the grant or error path.
- */
+/* Send a lock request to the server. */
 int scoutfs_client_lock_request(struct super_block *sb,
-				struct scoutfs_net_lock *nl,
-				struct scoutfs_lock *lock)
+				struct scoutfs_net_lock *nl)
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
 	return scoutfs_net_submit_request(sb, client->conn,
 					  SCOUTFS_NET_CMD_LOCK,
 					  nl, sizeof(*nl),
-					  client_lock_response, lock, NULL);
+					  client_lock_response, NULL, NULL);
 }
 
 /* Send a lock response to the server. */
@@ -241,26 +189,9 @@ int scoutfs_client_srch_get_compact(struct super_block *sb,
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn,
-				   SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
-				   NULL, 0, sc, sizeof(*sc));
-}
-
-/*
- * Bounded-wait get_compact for the srch compact worker.  The worker
- * reschedules on any error and the compact work is idempotent, so
- * -ETIMEDOUT just defers this round.
- */
-int scoutfs_client_srch_get_compact_timeout(struct super_block *sb,
-					    struct scoutfs_srch_compact *sc,
-					    unsigned long timeout_jiffies)
-{
-	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-
-	return scoutfs_net_sync_request_timeout(sb, client->conn,
-						SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
-						NULL, 0, sc, sizeof(*sc),
-						timeout_jiffies);
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_SRCH_GET_COMPACT,
+					NULL, 0, sc, sizeof(*sc));
 }
 
 /* Commit the result of a srch file compaction. */
@@ -269,27 +200,9 @@ int scoutfs_client_srch_commit_compact(struct super_block *sb,
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn,
-				   SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
-				   res, sizeof(*res), NULL, 0);
-}
-
-/*
- * Bounded-wait commit_compact for the srch compact worker.  The server
- * ignores partial work flagged with ERROR, so a timed-out commit
- * (marked ERROR on this side) lets the server reclaim our allocators
- * and reassign the compact on the next scheduled attempt.
- */
-int scoutfs_client_srch_commit_compact_timeout(struct super_block *sb,
-					       struct scoutfs_srch_compact *res,
-					       unsigned long timeout_jiffies)
-{
-	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-
-	return scoutfs_net_sync_request_timeout(sb, client->conn,
-						SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
-						res, sizeof(*res), NULL, 0,
-						timeout_jiffies);
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_SRCH_COMMIT_COMPACT,
+					res, sizeof(*res), NULL, 0);
 }
 
 int scoutfs_client_get_log_merge(struct super_block *sb,
@@ -297,9 +210,9 @@ int scoutfs_client_get_log_merge(struct super_block *sb,
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn,
-				   SCOUTFS_NET_CMD_GET_LOG_MERGE,
-				   NULL, 0, req, sizeof(*req));
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_GET_LOG_MERGE,
+					NULL, 0, req, sizeof(*req));
 }
 
 int scoutfs_client_commit_log_merge(struct super_block *sb,
@@ -307,9 +220,9 @@ int scoutfs_client_commit_log_merge(struct super_block *sb,
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn,
-				   SCOUTFS_NET_CMD_COMMIT_LOG_MERGE,
-				   comp, sizeof(*comp), NULL, 0);
+	return scoutfs_net_sync_request(sb, client->conn,
+					SCOUTFS_NET_CMD_COMMIT_LOG_MERGE,
+					comp, sizeof(*comp), NULL, 0);
 }
 
 int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
@@ -341,30 +254,8 @@ int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
 		.req_id = 0,
 	};
 
-	return client_sync_request(sb, client->conn, SCOUTFS_NET_CMD_OPEN_INO_MAP,
-				   &args, sizeof(args), map, sizeof(*map));
-}
-
-/*
- * Bounded-wait open_ino_map for the orphan scan worker.  The scan
- * reschedules on error; the delete path callers keep the unbounded
- * retry.
- */
-int scoutfs_client_open_ino_map_timeout(struct super_block *sb, u64 group_nr,
-					struct scoutfs_open_ino_map *map,
-					unsigned long timeout_jiffies)
-{
-	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-	struct scoutfs_open_ino_map_args args = {
-		.group_nr = cpu_to_le64(group_nr),
-		.req_id = 0,
-	};
-
-	return scoutfs_net_sync_request_timeout(sb, client->conn,
-						SCOUTFS_NET_CMD_OPEN_INO_MAP,
-						&args, sizeof(args),
-						map, sizeof(*map),
-						timeout_jiffies);
+	return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_OPEN_INO_MAP,
+					&args, sizeof(args), map, sizeof(*map));
 }
 
 /* The client is asking the server for the current volume options */
@@ -372,8 +263,8 @@ int scoutfs_client_get_volopt(struct super_block *sb, struct scoutfs_volume_opti
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn, SCOUTFS_NET_CMD_GET_VOLOPT,
-				   NULL, 0, volopt, sizeof(*volopt));
+	return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_GET_VOLOPT,
+					NULL, 0, volopt, sizeof(*volopt));
 }
 
 /* The client is asking the server to update volume options */
@@ -381,8 +272,8 @@ int scoutfs_client_set_volopt(struct super_block *sb, struct scoutfs_volume_opti
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn, SCOUTFS_NET_CMD_SET_VOLOPT,
-				   volopt, sizeof(*volopt), NULL, 0);
+	return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_SET_VOLOPT,
+					volopt, sizeof(*volopt), NULL, 0);
 }
 
 /* The client is asking the server to clear volume options */
@@ -390,24 +281,24 @@ int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_op
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn, SCOUTFS_NET_CMD_CLEAR_VOLOPT,
-				   volopt, sizeof(*volopt), NULL, 0);
+	return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_CLEAR_VOLOPT,
+					volopt, sizeof(*volopt), NULL, 0);
 }
 
 int scoutfs_client_resize_devices(struct super_block *sb, struct scoutfs_net_resize_devices *nrd)
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn, SCOUTFS_NET_CMD_RESIZE_DEVICES,
-				   nrd, sizeof(*nrd), NULL, 0);
+	return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_RESIZE_DEVICES,
+					nrd, sizeof(*nrd), NULL, 0);
 }
 
 int scoutfs_client_statfs(struct super_block *sb, struct scoutfs_net_statfs *nst)
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
 
-	return client_sync_request(sb, client->conn, SCOUTFS_NET_CMD_STATFS,
-				   NULL, 0, nst, sizeof(*nst));
+	return scoutfs_net_sync_request(sb, client->conn, SCOUTFS_NET_CMD_STATFS,
+					NULL, 0, nst, sizeof(*nst));
 }
 
 /*
@@ -755,12 +646,8 @@ void scoutfs_client_destroy(struct super_block *sb)
 					 client_farewell_response,
 					 NULL, NULL);
 	if (ret == 0) {
-		if (!wait_for_completion_timeout(&client->farewell_comp,
-						 120 * HZ)) {
-			ret = -ETIMEDOUT;
-		} else {
-			ret = client->farewell_error;
-		}
+		wait_for_completion(&client->farewell_comp);
+		ret = client->farewell_error;
 	}
 	if (ret) {
 		scoutfs_inc_counter(sb, client_farewell_error);
@@ -774,16 +661,10 @@ void scoutfs_client_destroy(struct super_block *sb)
 	/* make sure worker isn't using the conn */
 	cancel_delayed_work_sync(&client->connect_dwork);
 
-	/*
-	 * Drain the conn's workers before nulling client->conn.  In-flight
-	 * proc_workers dispatch request handlers that call back into client
-	 * response helpers (e.g. scoutfs_client_lock_recover_response) which
-	 * read client->conn; nulling it first races with those workers and
-	 * causes submit_send to dereference a NULL conn->lock.
-	 */
+	/* make racing conn use explode */
 	conn = client->conn;
-	scoutfs_net_free_conn(sb, conn);
 	client->conn = NULL;
+	scoutfs_net_free_conn(sb, conn);
 
 	if (client->workq)
 		destroy_workqueue(client->workq);
@@ -9,28 +9,18 @@ int scoutfs_client_commit_log_trees(struct super_block *sb,
 				    struct scoutfs_log_trees *lt);
 int scoutfs_client_get_roots(struct super_block *sb,
 			     struct scoutfs_net_roots *roots);
-int scoutfs_client_get_roots_timeout(struct super_block *sb,
-				     struct scoutfs_net_roots *roots,
-				     unsigned long timeout_jiffies);
 u64 *scoutfs_client_bulk_alloc(struct super_block *sb);
 int scoutfs_client_get_last_seq(struct super_block *sb, u64 *seq);
 int scoutfs_client_lock_request(struct super_block *sb,
-				struct scoutfs_net_lock *nl,
-				struct scoutfs_lock *lock);
+				struct scoutfs_net_lock *nl);
 int scoutfs_client_lock_response(struct super_block *sb, u64 net_id,
 				 struct scoutfs_net_lock *nl);
 int scoutfs_client_lock_recover_response(struct super_block *sb, u64 net_id,
 					 struct scoutfs_net_lock_recover *nlr);
 int scoutfs_client_srch_get_compact(struct super_block *sb,
 				    struct scoutfs_srch_compact *sc);
-int scoutfs_client_srch_get_compact_timeout(struct super_block *sb,
-					    struct scoutfs_srch_compact *sc,
-					    unsigned long timeout_jiffies);
 int scoutfs_client_srch_commit_compact(struct super_block *sb,
 				       struct scoutfs_srch_compact *res);
-int scoutfs_client_srch_commit_compact_timeout(struct super_block *sb,
-					       struct scoutfs_srch_compact *res,
-					       unsigned long timeout_jiffies);
 int scoutfs_client_get_log_merge(struct super_block *sb,
 				 struct scoutfs_log_merge_request *req);
 int scoutfs_client_commit_log_merge(struct super_block *sb,
@@ -39,9 +29,6 @@ int scoutfs_client_send_omap_response(struct super_block *sb, u64 id,
 				      struct scoutfs_open_ino_map *map);
 int scoutfs_client_open_ino_map(struct super_block *sb, u64 group_nr,
 				struct scoutfs_open_ino_map *map);
-int scoutfs_client_open_ino_map_timeout(struct super_block *sb, u64 group_nr,
-					struct scoutfs_open_ino_map *map,
-					unsigned long timeout_jiffies);
 int scoutfs_client_get_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
 int scoutfs_client_set_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
 int scoutfs_client_clear_volopt(struct super_block *sb, struct scoutfs_volume_options *volopt);
@@ -62,7 +62,6 @@
 	EXPAND_COUNTER(btree_walk) \
 	EXPAND_COUNTER(btree_walk_restart) \
 	EXPAND_COUNTER(client_farewell_error) \
-	EXPAND_COUNTER(client_rpc_timeout) \
 	EXPAND_COUNTER(corrupt_btree_block_level) \
 	EXPAND_COUNTER(corrupt_btree_no_child_ref) \
 	EXPAND_COUNTER(corrupt_dirent_backref_name_len) \
@@ -139,7 +138,6 @@
 	EXPAND_COUNTER(lock_lock_error) \
 	EXPAND_COUNTER(lock_nonblock_eagain) \
 	EXPAND_COUNTER(lock_recover_request) \
-	EXPAND_COUNTER(lock_request_failed) \
 	EXPAND_COUNTER(lock_shrink_attempted) \
 	EXPAND_COUNTER(lock_shrink_request_failed) \
 	EXPAND_COUNTER(lock_unlock) \
@@ -114,6 +114,42 @@ static struct scoutfs_block *read_bloom_ref(struct super_block *sb, struct scout
 	return bl;
 }
 
+/*
+ * Returns >0 if there was a bloom block and all the bits were present.
+ */
+static int all_bloom_bits_present(struct super_block *sb, struct scoutfs_block_ref *ref,
+				  struct forest_bloom_nrs *bloom)
+{
+	struct scoutfs_bloom_block *bb;
+	struct scoutfs_block *bl;
+	int i;
+
+	if (ref->blkno == 0)
+		return 0;
+
+	bl = read_bloom_ref(sb, ref);
+	if (IS_ERR(bl))
+		return PTR_ERR(bl);
+
+	bb = bl->data;
+
+	for (i = 0; i < ARRAY_SIZE(bloom->nrs); i++) {
+		if (!test_bit_le(bloom->nrs[i], bb->bits))
+			break;
+	}
+
+	scoutfs_block_put(sb, bl);
+
+	/* one of the bloom bits wasn't set */
+	if (i != ARRAY_SIZE(bloom->nrs)) {
+		scoutfs_inc_counter(sb, forest_bloom_fail);
+		return 0;
+	}
+
+	scoutfs_inc_counter(sb, forest_bloom_pass);
+	return 1;
+}
+
 /*
 * This is an unlocked iteration across all the btrees to find a hint at
 * the next key that the caller could read.  It's used to find out what
@@ -227,9 +263,13 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6
 }
 
 /*
- * For each forest btree whose bloom block indicates that the lock might
- * have items stored, call the caller's callback for every item in the
- * leaf block in each tree which contains the key.
+ * Call the caller's callback for every item in the leaf blocks in each
+ * forest btree that contain the caller's key.
+ *
+ * If a bloom key is provided then each log tree's bloom block is
+ * checked and only trees with all the bloom key's bloom bits set will
+ * be read from.  When the bloom key is null all trees will be read
+ * from.
 *
 * The btree iter calls clamp the caller's range to the tightest range
 * that covers all the blocks.  Any keys outside of this range can't be
@@ -248,24 +288,17 @@ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_r
 		.cb_arg = arg,
 	};
 	struct scoutfs_log_trees lt;
-	struct scoutfs_bloom_block *bb;
 	struct forest_bloom_nrs bloom;
 	SCOUTFS_BTREE_ITEM_REF(iref);
-	struct scoutfs_block *bl;
 	struct scoutfs_key ltk;
-	struct scoutfs_key orig_start = *start;
-	struct scoutfs_key orig_end = *end;
 	int ret;
-	int i;
 
 	scoutfs_inc_counter(sb, forest_read_items);
-	calc_bloom_nrs(&bloom, bloom_key);
+	if (bloom_key)
+		calc_bloom_nrs(&bloom, bloom_key);
 
 	trace_scoutfs_forest_using_roots(sb, &roots->fs_root, &roots->logs_root);
 
-	*start = orig_start;
-	*end = orig_end;
-
 	/* start with fs root items */
 	rid.fic |= FIC_FS_ROOT;
 	ret = scoutfs_btree_read_items(sb, &roots->fs_root, key, start, end,
@@ -292,30 +325,17 @@ int scoutfs_forest_read_items_roots(struct super_block *sb, struct scoutfs_net_r
 			goto out; /* including stale */
 		}
 
-		if (lt.bloom_ref.blkno == 0)
+		/* we're not expecting -ENOENT from _read_items */
+		if (lt.item_root.ref.blkno == 0)
 			continue;
 
-		bl = read_bloom_ref(sb, &lt.bloom_ref);
-		if (IS_ERR(bl)) {
-			ret = PTR_ERR(bl);
-			goto out;
+		if (bloom_key) {
+			ret = all_bloom_bits_present(sb, &lt.bloom_ref, &bloom);
+			if (ret < 0)
+				goto out;
+			if (ret == 0)
+				continue;
 		}
-		bb = bl->data;
-
-		for (i = 0; i < ARRAY_SIZE(bloom.nrs); i++) {
-			if (!test_bit_le(bloom.nrs[i], bb->bits))
-				break;
-		}
-
-		scoutfs_block_put(sb, bl);
-
-		/* one of the bloom bits wasn't set */
-		if (i != ARRAY_SIZE(bloom.nrs)) {
-			scoutfs_inc_counter(sb, forest_bloom_fail);
-			continue;
-		}
-
-		scoutfs_inc_counter(sb, forest_bloom_pass);
-
 		if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED) &&
 		    (merge_input_seq == 0 ||
@@ -2074,14 +2074,6 @@ void scoutfs_inode_schedule_orphan_dwork(struct super_block *sb)
 	}
 }
 
-/*
- * Generous per-RPC bound for the idempotent orphan scan worker.  A
- * server that hasn't answered in this long is assumed to be broken;
- * dropping the request lets the scan reschedule instead of blocking
- * forever.
- */
-#define ORPHAN_SCAN_RPC_TIMEOUT (5 * 60 * HZ)
-
 /*
 * Find and delete inodes whose only remaining reference is the
 * persistent orphan item that was created as they were unlinked.
@@ -2136,7 +2128,7 @@ static void inode_orphan_scan_worker(struct work_struct *work)
 	init_orphan_key(&last, U64_MAX);
 	omap.args.group_nr = cpu_to_le64(U64_MAX);
 
-	ret = scoutfs_client_get_roots_timeout(sb, &roots, ORPHAN_SCAN_RPC_TIMEOUT);
+	ret = scoutfs_client_get_roots(sb, &roots);
 	if (ret)
 		goto out;
 
@@ -2177,8 +2169,7 @@ static void inode_orphan_scan_worker(struct work_struct *work)
 		scoutfs_omap_calc_group_nrs(ino, &group_nr, &bit_nr);
 
 		if (le64_to_cpu(omap.args.group_nr) != group_nr) {
-			ret = scoutfs_client_open_ino_map_timeout(sb, group_nr, &omap,
-								  ORPHAN_SCAN_RPC_TIMEOUT);
+			ret = scoutfs_client_open_ino_map(sb, group_nr, &omap);
 			if (ret < 0)
 				goto out;
 		}
@@ -49,6 +49,7 @@
 #include "quota.h"
 #include "scoutfs_trace.h"
 #include "util.h"
+#include "raw.h"
 
 /*
 * We make inode index items coherent by locking fixed size regions of
@@ -1739,6 +1740,69 @@ out:
 	return ret;
 }
 
+static long scoutfs_ioc_raw_read_meta_seq(struct file *file, unsigned long arg)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct scoutfs_ioctl_raw_read_meta_seq __user *urms = (void __user *)arg;
+	struct scoutfs_ioctl_raw_read_meta_seq rms;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (copy_from_user(&rms, urms, sizeof(rms))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (rms.results_size == 0) {
+		ret = 0;
+		goto out;
+	}
+
+	if (rms.results_size < sizeof(struct scoutfs_ioctl_meta_seq) ||
+	    rms.results_size > INT_MAX) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = scoutfs_raw_read_meta_seq(sb, &rms, &rms.last);
+	if (ret >= 0 && copy_to_user(&urms->last, &rms.last, sizeof(rms.last)))
+		ret = -EFAULT;
+out:
+	return ret;
+}
+
+static long scoutfs_ioc_raw_read_inode_info(struct file *file, unsigned long arg)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct scoutfs_ioctl_raw_read_inode_info __user *urii = (void __user *)arg;
+	struct scoutfs_ioctl_raw_read_inode_info rii;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (copy_from_user(&rii, urii, sizeof(rii))) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (rii.inos_count == 0 || rii.results_size > INT_MAX ||
+	    !IS_ALIGNED(rii.inos_ptr, __alignof__(__u64))) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = scoutfs_raw_read_inode_info(sb, &rii);
+out:
+	return ret;
+}
+
 long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
@@ -1790,6 +1854,10 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		return scoutfs_ioc_read_xattr_index(file, arg);
 	case SCOUTFS_IOC_PUNCH_OFFLINE:
 		return scoutfs_ioc_punch_offline(file, arg);
+	case SCOUTFS_IOC_RAW_READ_META_SEQ:
+		return scoutfs_ioc_raw_read_meta_seq(file, arg);
+	case SCOUTFS_IOC_RAW_READ_INODE_INFO:
+		return scoutfs_ioc_raw_read_inode_info(file, arg);
 	}
 
 	return -ENOTTY;
kmod/src/ioctl.h (179 changed lines)
@@ -15,20 +15,6 @@
 
 #define SCOUTFS_IOCTL_MAGIC 0xE8 /* arbitrarily chosen hole in ioctl-number.rst */
 
-/*
- * Packed scoutfs keys rarely cross the ioctl boundary so we have a
- * translation struct.
- */
-struct scoutfs_ioctl_key {
-	__le64 _sk_first;
-	__le64 _sk_second;
-	__le64 _sk_third;
-	__u8 _sk_fourth;
-	__u8 sk_type;
-	__u8 sk_zone;
-	__u8 _pad[5];
-};
-
 struct scoutfs_ioctl_walk_inodes_entry {
 	__u64 major;
 	__u64 ino;
@@ -876,4 +862,169 @@ struct scoutfs_ioctl_punch_offline {
 #define SCOUTFS_IOC_PUNCH_OFFLINE \
 	_IOW(SCOUTFS_IOCTL_MAGIC, 24, struct scoutfs_ioctl_punch_offline)
 
+/*
+ * Read meta_seq items without cluster locking.
+ *
+ * @start is the first meta_seq item value that could be returned.
+ * {0,0} is the minimum.
+ *
+ * @end is the last meta_seq item value that could be returned.
+ * {U64_MAX, U64_MAX} is the maximum.
+ *
+ * @last is only set on success from the call.  It's the last meta_seq
+ * item that could have been returned.  This lets the caller detect that
+ * the full input range wasn't explored.  Another call can be made with
+ * start set to just after this.
+ *
+ * @results_ptr is a pointer to an array of (struct
+ * scoutfs_ioctl_meta_seq) elements that were found in the input range.
+ *
+ * @results_size is the count of elements in the results_ptr array and
+ * the maximum number of results that can be returned.  There must be
+ * room for at least one result.
+ *
+ * Return existing meta_seq items starting from @start until @end.
+ * Partial results can be returned, indicated by @last being set to an
+ * item before @end.
+ *
+ * The results are sorted first by increasing meta_seq and then by
+ * increasing ino.  All of the results are from one version of file
+ * system metadata.  This means that an inode can not be found multiple
+ * times within the results of one call.
+ *
+ * This call ignores currently dirty transactions and reads persistent
+ * items directly.  A transaction can be written after this call and
+ * cause meta_seq items to appear before or within the results from this
+ * call.
+ *
+ * The number of meta_seq items stored in the results buffer is returned
+ * and @last is updated.  0 items can be returned if none are found
+ * within the input range.
+ *
+ * Unique errors:
+ *
+ * -EINVAL: The result count was 0 or greater than INT_MAX.
+ *
+ * -ESTALE: The results could not be read from one stable version of
+ * file system metadata.  Decrease the number of inodes requested.
+ */
+struct scoutfs_ioctl_meta_seq {
+	__u64 meta_seq;
+	__u64 ino;
+};
+struct scoutfs_ioctl_raw_read_meta_seq {
+	struct scoutfs_ioctl_meta_seq start;
+	struct scoutfs_ioctl_meta_seq end;
+	struct scoutfs_ioctl_meta_seq last;
+	__u64 results_ptr;
+	__u32 results_size;
+	__u32 _pad;
+};
+#define SCOUTFS_IOC_RAW_READ_META_SEQ \
+	_IOR(SCOUTFS_IOCTL_MAGIC, 25, struct scoutfs_ioctl_raw_read_meta_seq)
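A hypothetical userspace sketch of the batching loop this interface implies (the device fd, batch size, and the lexicographic increment past @last are assumptions, not part of the header; the scoutfs ioctl definitions above are assumed to be in scope):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

static int dump_meta_seq(int fd)
{
	struct scoutfs_ioctl_meta_seq results[128];
	struct scoutfs_ioctl_raw_read_meta_seq rms;
	int nr, i;

	memset(&rms, 0, sizeof(rms));
	rms.end.meta_seq = ~0ULL;		/* whole range */
	rms.end.ino = ~0ULL;
	rms.results_ptr = (unsigned long)results;
	rms.results_size = 128;

	for (;;) {
		nr = ioctl(fd, SCOUTFS_IOC_RAW_READ_META_SEQ, &rms);
		if (nr < 0)
			return -1;
		for (i = 0; i < nr; i++)
			printf("seq %llu ino %llu\n",
			       (unsigned long long)results[i].meta_seq,
			       (unsigned long long)results[i].ino);
		if (rms.last.meta_seq == ~0ULL && rms.last.ino == ~0ULL)
			break;			/* explored all the way to @end */
		rms.start = rms.last;		/* resume just past @last */
		if (++rms.start.ino == 0)
			rms.start.meta_seq++;
	}
	return 0;
}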
+
+
+/*
+ * Read inode metadata without cluster locking.
+ *
+ * @inos_ptr is a pointer to an aligned array of 64bit inode numbers.
+ *
+ * @inos_count is the number of elements in the array.  The inode
+ * numbers must not be zero, must strictly increase, and must not
+ * contain any duplicates.
+ *
+ * @names_ptr is a pointer to a byte array of xattr names to return with
+ * each inode.  The names are identical to those used in
+ * {get,set}xattr(2).  The names must be null terminated and no two
+ * names may be equal.
+ *
+ * @names_count is the number of names that will be found in the
+ * names_ptr buffer.
+ *
+ * @results_ptr is a pointer to a buffer that will be filled by the read
+ * inode info results.  The result structs and payloads are not aligned.
+ * Callers will almost certainly need to copy them into aligned
+ * addresses before referencing their contents.
+ *
+ * @results_size is the number of bytes available in the results_ptr
+ * buffer.
+ *
+ * For each inode an _INODE result will always be returned.  Then an
+ * _XATTR result will be returned for each xattr on the inode that
+ * matches one of the given input names.
+ *
+ * Each call will not return partial results.  -ERANGE is returned if the
+ * results for the requested inodes do not fit in the results buffer.
+ *
+ * The info for one call is from one consistent version of the file
+ * system metadata.  The call may have to retry if it sees metadata
+ * change while it runs.  -ESTALE will be returned if it was not able
+ * to read all the inodes' info from one metadata version.  The number
+ * of inodes being read can be decreased to avoid this.
+ *
+ * Inodes with an nlink of 0 are not returned.
+ *
+ * The size in bytes of filled results is returned.  A non-zero return
+ * will always include at least one full
+ * (struct scoutfs_ioctl_raw_read_result) header.
+ *
+ * Unique errors:
+ *
+ * -EINVAL: The inode count can't be zero.  The inos ptr must be aligned
+ * to __u64 alignment.  The results buffer size can't be larger than
+ * INT_MAX.  Inode numbers can't be zero, must be sorted, and can't
+ * have duplicates.  The xattr names must be unique, null terminated,
+ * and less than 256 bytes long.
+ *
+ * -ERANGE: The results for the requested inodes do not fit in the
+ * results buffer.  Increase the buffer size (perhaps allowing for all
+ * xattrs with large values) or decrease the number of inodes per call.
+ *
+ * -ESTALE: The results could not be read from one stable version of
+ * file system metadata.  Decrease the number of inodes requested.
+ *
+ * -EUCLEAN: Internal xattr metadata is inconsistent.
+ */
+struct scoutfs_ioctl_raw_read_inode_info {
+	__u64 inos_ptr;
+	__u32 inos_count;
+	__u32 names_count;
+	__u64 names_ptr;
+	__u64 results_ptr;
+	__u32 results_size;
+	__u8 _pad[4];
+};
+
+/*
+ * @type is one of the enums that determines the type of the following
+ * result payload.
+ *
+ * @size is the number of bytes of result payload immediately following
+ * the result struct.  It does not include the size of the result struct
+ * header.
+ */
+struct scoutfs_ioctl_raw_read_result {
+	__u32 size;
+	__u8 _pad[7];
+	__u8 type;
+};
+
+/*
+ * The _INODE result contains an initial 64bit inode number followed by a
+ * struct scoutfs_inode as defined in format.h.  The size includes the
+ * 8byte initial inode number.  With that subtracted, the size of the
+ * inode struct defines its version (and so the fields it supports).
+ */
+#define SCOUTFS_IOC_RAW_READ_RESULT_INODE 1
+/*
+ * The result payload contains the null terminated name and the value.
+ * The value size can be found by subtracting the null terminated name
+ * length from the result size.
+ */
+#define SCOUTFS_IOC_RAW_READ_RESULT_XATTR 2
+
+#define SCOUTFS_IOC_RAW_READ_INODE_INFO \
+	_IOR(SCOUTFS_IOCTL_MAGIC, 25, struct scoutfs_ioctl_raw_read_inode_info)
+
 #endif
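And a matching hypothetical sketch of the unaligned result walk the comment calls for (the fd, inode number, xattr name, and buffer size are made up for illustration; the scoutfs ioctl definitions are assumed in scope):

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>

static int read_one_inode(int fd, __u64 ino)
{
	static char results[64 * 1024];
	static const char names[] = "user.example";	/* one NUL-terminated name */
	struct scoutfs_ioctl_raw_read_inode_info rii;
	struct scoutfs_ioctl_raw_read_result hdr;
	int bytes, off = 0;

	memset(&rii, 0, sizeof(rii));
	rii.inos_ptr = (unsigned long)&ino;
	rii.inos_count = 1;
	rii.names_ptr = (unsigned long)names;
	rii.names_count = 1;
	rii.results_ptr = (unsigned long)results;
	rii.results_size = sizeof(results);

	bytes = ioctl(fd, SCOUTFS_IOC_RAW_READ_INODE_INFO, &rii);
	if (bytes < 0)
		return -1;

	while (off + (int)sizeof(hdr) <= bytes) {
		/* headers and payloads are unaligned: copy before use */
		memcpy(&hdr, results + off, sizeof(hdr));
		printf("result type %u, %u payload bytes\n",
		       (unsigned)hdr.type, (unsigned)hdr.size);
		off += sizeof(hdr) + hdr.size;
	}
	return 0;
}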
kmod/src/lock.c (124 changed lines)
@@ -71,8 +71,6 @@
 * relative to that lock state we resend.
 */
 
-#define CLIENT_LOCK_WAIT_TIMEOUT (60 * HZ)
-
 /*
 * allocated per-super, freed on unmount.
 */
@@ -159,33 +157,6 @@ static void invalidate_inode(struct super_block *sb, u64 ino)
 	}
 }
 
-/*
- * Remove all coverage items from the lock to tell users that their
- * cache is stale.  This is lock-internal bookkeeping that is safe to
- * call during shutdown and unmount.  The unconditional unlock/relock
- * of cov_list_lock avoids sparse warnings from unbalanced locking in
- * the trylock failure path.
- */
-static void lock_clear_coverage(struct super_block *sb,
-				struct scoutfs_lock *lock)
-{
-	struct scoutfs_lock_coverage *cov;
-
-	spin_lock(&lock->cov_list_lock);
-	while ((cov = list_first_entry_or_null(&lock->cov_list,
-					       struct scoutfs_lock_coverage, head))) {
-		if (spin_trylock(&cov->cov_lock)) {
-			list_del_init(&cov->head);
-			cov->lock = NULL;
-			spin_unlock(&cov->cov_lock);
-			scoutfs_inc_counter(sb, lock_invalidate_coverage);
-		}
-		spin_unlock(&lock->cov_list_lock);
-		spin_lock(&lock->cov_list_lock);
-	}
-	spin_unlock(&lock->cov_list_lock);
-}
-
 /*
 * Invalidate caches associated with this lock.  Either we're
 * invalidating a write to a read or we're invalidating to null.  We
@@ -195,6 +166,7 @@ static void lock_clear_coverage(struct super_block *sb,
 static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
 			   enum scoutfs_lock_mode prev, enum scoutfs_lock_mode mode)
 {
+	struct scoutfs_lock_coverage *cov;
 	u64 ino, last;
 	int ret = 0;
 
@@ -218,7 +190,24 @@ static int lock_invalidate(struct super_block *sb, struct scoutfs_lock *lock,
 
 	/* have to invalidate if we're not in the only usable case */
 	if (!(prev == SCOUTFS_LOCK_WRITE && mode == SCOUTFS_LOCK_READ)) {
-		lock_clear_coverage(sb, lock);
+		/*
+		 * Remove cov items to tell users that their cache is
+		 * stale.  The unlock pattern comes from avoiding bad
+		 * sparse warnings when taking else in a failed trylock.
+		 */
+		spin_lock(&lock->cov_list_lock);
+		while ((cov = list_first_entry_or_null(&lock->cov_list,
+						       struct scoutfs_lock_coverage, head))) {
+			if (spin_trylock(&cov->cov_lock)) {
+				list_del_init(&cov->head);
+				cov->lock = NULL;
+				spin_unlock(&cov->cov_lock);
+				scoutfs_inc_counter(sb, lock_invalidate_coverage);
+			}
+			spin_unlock(&lock->cov_list_lock);
+			spin_lock(&lock->cov_list_lock);
+		}
+		spin_unlock(&lock->cov_list_lock);
 
 		/* invalidate inodes after removing coverage so drop/evict aren't covered */
 		if (lock->start.sk_zone == SCOUTFS_FS_ZONE) {
@@ -654,33 +643,6 @@ int scoutfs_lock_grant_response(struct super_block *sb,
 	return 0;
 }
 
-/*
- * The lock request we sent to the server was dropped before we could
- * receive a grant response.  This happens when the client reconnects to
- * a new server and completes pending requests with an error, since the
- * old server's pending-request state was reclaimed at fence time.
- *
- * Clear request_pending so that a waiter in lock_key_range re-evaluates
- * and sends a fresh request to the new server, and symmetrically put
- * the lock so shrink's lru state matches the grant_response path.
- */
-void scoutfs_lock_request_failed(struct super_block *sb,
-				 struct scoutfs_lock *lock)
-{
-	DECLARE_LOCK_INFO(sb, linfo);
-
-	scoutfs_inc_counter(sb, lock_request_failed);
-
-	spin_lock(&linfo->lock);
-
-	BUG_ON(!lock->request_pending);
-	lock->request_pending = 0;
-	wake_up(&lock->waitq);
-	put_lock(linfo, lock);
-
-	spin_unlock(&linfo->lock);
-}
-
 struct inv_req {
 	struct list_head head;
 	struct scoutfs_lock *lock;
@@ -752,13 +714,10 @@ static void lock_invalidate_worker(struct work_struct *work)
 		ireq = list_first_entry(&lock->inv_list, struct inv_req, head);
 		nl = &ireq->nl;
 
-		/* only lock protocol, inv can't call subsystems after shutdown or unmount */
-		if (!linfo->shutdown && !scoutfs_unmounting(sb)) {
+		/* only lock protocol, inv can't call subsystems after shutdown */
+		if (!linfo->shutdown) {
 			ret = lock_invalidate(sb, lock, nl->old_mode, nl->new_mode);
 			BUG_ON(ret < 0 && ret != -ENOLINK);
 		} else {
-			lock_clear_coverage(sb, lock);
 			scoutfs_item_invalidate(sb, &lock->start, &lock->end);
 		}
 
 		/* respond with the key and modes from the request, server might have died */
@@ -963,7 +922,7 @@ static bool try_shrink_lock(struct super_block *sb, struct lock_info *linfo, boo
 	spin_unlock(&linfo->lock);
 
 	if (lock) {
-		ret = scoutfs_client_lock_request(sb, &nl, lock);
+		ret = scoutfs_client_lock_request(sb, &nl);
 		if (ret < 0) {
 			scoutfs_inc_counter(sb, lock_shrink_request_failed);
 
@@ -994,9 +953,6 @@ static bool lock_wait_cond(struct super_block *sb, struct scoutfs_lock *lock,
 	       !lock->request_pending;
 	spin_unlock(&linfo->lock);
 
-	if (!wake)
-		wake = scoutfs_unmounting(sb);
-
 	if (!wake)
 		scoutfs_inc_counter(sb, lock_wait);
 
@@ -1041,10 +997,8 @@ static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, i
 		return -EINVAL;
 
 	/* maybe catch _setup() and _shutdown order mistakes */
-	if (!linfo || linfo->shutdown) {
-		WARN_ON_ONCE(!scoutfs_unmounting(sb));
+	if (WARN_ON_ONCE(!linfo || linfo->shutdown))
 		return -ENOLCK;
-	}
 
 	/* have to lock before entering transactions */
 	if (WARN_ON_ONCE(scoutfs_trans_held()))
@@ -1070,11 +1024,6 @@ static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, i
 			break;
 		}
 
-		if (scoutfs_unmounting(sb)) {
-			ret = -ESHUTDOWN;
-			break;
-		}
-
 		/* the fast path where we can use the granted mode */
 		if (lock_modes_match(lock->mode, mode)) {
 			lock_inc_count(lock->users, mode);
@@ -1104,7 +1053,7 @@ static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, i
 		nl.old_mode = lock->mode;
 		nl.new_mode = mode;
 
-		ret = scoutfs_client_lock_request(sb, &nl, lock);
+		ret = scoutfs_client_lock_request(sb, &nl);
 		if (ret) {
 			spin_lock(&linfo->lock);
 			lock->request_pending = 0;
@@ -1118,9 +1067,8 @@ static int lock_key_range(struct super_block *sb, enum scoutfs_lock_mode mode, i
 		if (flags & SCOUTFS_LKF_INTERRUPTIBLE) {
 			ret = wait_event_interruptible(lock->waitq,
 						lock_wait_cond(sb, lock, mode));
-		} else if (!wait_event_timeout(lock->waitq,
-					       lock_wait_cond(sb, lock, mode),
-					       CLIENT_LOCK_WAIT_TIMEOUT)) {
+		} else {
+			wait_event(lock->waitq, lock_wait_cond(sb, lock, mode));
+			ret = 0;
 		}
 
@@ -1145,19 +1093,24 @@ out_unlock:
 	return ret;
 }
 
+void scoutfs_lock_get_fs_item_range(u64 ino, struct scoutfs_key *start, struct scoutfs_key *end)
+{
+	scoutfs_key_set_zeros(start);
+	start->sk_zone = SCOUTFS_FS_ZONE;
+	start->ski_ino = cpu_to_le64(ino & ~(u64)SCOUTFS_LOCK_INODE_GROUP_MASK);
+
+	scoutfs_key_set_ones(end);
+	end->sk_zone = SCOUTFS_FS_ZONE;
+	end->ski_ino = cpu_to_le64(ino | SCOUTFS_LOCK_INODE_GROUP_MASK);
+}
+
 int scoutfs_lock_ino(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
 		     struct scoutfs_lock **ret_lock)
 {
 	struct scoutfs_key start;
 	struct scoutfs_key end;
 
-	scoutfs_key_set_zeros(&start);
-	start.sk_zone = SCOUTFS_FS_ZONE;
-	start.ski_ino = cpu_to_le64(ino & ~(u64)SCOUTFS_LOCK_INODE_GROUP_MASK);
-
-	scoutfs_key_set_ones(&end);
-	end.sk_zone = SCOUTFS_FS_ZONE;
-	end.ski_ino = cpu_to_le64(ino | SCOUTFS_LOCK_INODE_GROUP_MASK);
+	scoutfs_lock_get_fs_item_range(ino, &start, &end);
 
 	return lock_key_range(sb, mode, flags, &start, &end, ret_lock);
 }
@@ -1702,7 +1655,6 @@ void scoutfs_lock_destroy(struct super_block *sb)
 			list_del_init(&lock->inv_head);
 			lock->invalidate_pending = 0;
 		}
-		lock_clear_coverage(sb, lock);
 		lock_remove(linfo, lock);
 		lock_free(linfo, lock);
 	}
@@ -60,13 +60,12 @@ struct scoutfs_lock_coverage {
 
 int scoutfs_lock_grant_response(struct super_block *sb,
 				struct scoutfs_net_lock *nl);
-void scoutfs_lock_request_failed(struct super_block *sb,
-				 struct scoutfs_lock *lock);
 int scoutfs_lock_invalidate_request(struct super_block *sb, u64 net_id,
 				    struct scoutfs_net_lock *nl);
 int scoutfs_lock_recover_request(struct super_block *sb, u64 net_id,
 				 struct scoutfs_key *key);
 
+void scoutfs_lock_get_fs_item_range(u64 ino, struct scoutfs_key *start, struct scoutfs_key *end);
 int scoutfs_lock_inode(struct super_block *sb, enum scoutfs_lock_mode mode, int flags,
 		       struct inode *inode, struct scoutfs_lock **ret_lock);
 int scoutfs_lock_ino(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino,
kmod/src/net.c (145 changed lines)
@@ -1750,10 +1750,8 @@ void scoutfs_net_client_greeting(struct super_block *sb,
|
||||
bool new_server)
|
||||
{
|
||||
struct net_info *ninf = SCOUTFS_SB(sb)->net_info;
|
||||
scoutfs_net_response_t resp_func;
|
||||
struct message_send *msend;
|
||||
struct message_send *tmp;
|
||||
void *resp_data;
|
||||
|
||||
/* only called on client connections :/ */
|
||||
BUG_ON(conn->listening_conn);
|
||||
@@ -1762,32 +1760,10 @@ void scoutfs_net_client_greeting(struct super_block *sb,
|
||||
|
||||
if (new_server) {
|
||||
atomic64_set(&conn->recv_seq, 0);
|
||||
|
||||
/* drop stale responses; old server's state is gone */
|
||||
list_for_each_entry_safe(msend, tmp, &conn->resend_queue, head){
|
||||
if (nh_is_response(&msend->nh))
|
||||
free_msend(ninf, conn, msend);
|
||||
}
|
||||
|
||||
/*
|
||||
* Complete pending requests with -ECONNRESET. Any state
|
||||
* they depended on in the old server was reclaimed at
|
||||
* fence time, so resending is wrong. Callers re-issue on
|
||||
* the new server if they still care.
|
||||
*/
|
||||
while ((msend = list_first_entry_or_null(&conn->resend_queue,
|
||||
struct message_send, head))) {
|
||||
if (nh_is_response(&msend->nh))
|
||||
break;
|
||||
resp_func = msend->resp_func;
|
||||
resp_data = msend->resp_data;
|
||||
free_msend(ninf, conn, msend);
|
||||
spin_unlock(&conn->lock);
|
||||
|
||||
call_resp_func(sb, conn, resp_func, resp_data, NULL, 0, -ECONNRESET);
|
||||
|
||||
spin_lock(&conn->lock);
|
||||
}
|
||||
}
|
||||
|
||||
set_valid_greeting(conn);
|
||||
@@ -2014,9 +1990,8 @@ static int sync_response(struct super_block *sb,
 * buffer. Errors returned can come from the remote request processing
 * or local failure to send.
 *
 * The wait for the response uses a 60 second timeout loop that
 * checks for unmount, returning -ESHUTDOWN if the mount is
 * being torn down.
 * The wait for the response is interruptible and can return
 * -ERESTARTSYS if it is interrupted.
 *
 * -EOVERFLOW is returned if the response message's data_length doesn't
 * match the caller's resp_len buffer.
@@ -2027,7 +2002,6 @@ int scoutfs_net_sync_request(struct super_block *sb,
			     void *resp, size_t resp_len)
{
	struct sync_request_completion sreq;
	struct message_send *msend;
	int ret;
	u64 id;

@@ -2040,124 +2014,13 @@ int scoutfs_net_sync_request(struct super_block *sb,
				       sync_response, &sreq, &id);

	if (ret == 0) {
		while (!wait_for_completion_timeout(&sreq.comp, 60 * HZ)) {
			if (scoutfs_unmounting(sb)) {
				ret = -ESHUTDOWN;
				break;
			}
		}
		if (ret == -ESHUTDOWN) {
			spin_lock(&conn->lock);
			msend = find_request(conn, cmd, id);
			if (msend)
				queue_dead_free(conn, msend);
			spin_unlock(&conn->lock);
		} else {
			ret = sreq.error;
		}
		wait_for_completion(&sreq.comp);
		ret = sreq.error;
	}

	return ret;
}

/*
 * A bounded-wait variant of sync_request for idempotent background
 * workers that must reschedule instead of blocking indefinitely on an
 * unresponsive server. Returns -ETIMEDOUT if the response doesn't
 * arrive within timeout_jiffies; the caller then treats it like any
 * other RPC failure and retries on its normal reschedule cadence.
 *
 * Response state lives in a refcounted heap allocation rather than on
 * the caller's stack so a late callback can't scribble into freed
 * memory if we give up waiting. On timeout we race with an arriving
 * response for the msend: if find_request wins we queue_dead_free and
 * the callback won't fire (we drop its ref); otherwise the callback is
 * already running so we wait for it to complete before returning.
 */
struct bounded_sync {
	struct completion comp;
	void *resp;
	unsigned int resp_len;
	int error;
	atomic_t refs;
};

static void bounded_sync_put(struct bounded_sync *bs)
{
	if (atomic_dec_and_test(&bs->refs))
		kfree(bs);
}

static int bounded_sync_response(struct super_block *sb,
				 struct scoutfs_net_connection *conn,
				 void *resp, unsigned int resp_len,
				 int error, void *data)
{
	struct bounded_sync *bs = data;

	if (error == 0 && resp_len != bs->resp_len)
		error = -EMSGSIZE;

	if (error)
		bs->error = error;
	else if (resp_len)
		memcpy(bs->resp, resp, resp_len);

	complete(&bs->comp);
	bounded_sync_put(bs);
	return 0;
}

int scoutfs_net_sync_request_timeout(struct super_block *sb,
				     struct scoutfs_net_connection *conn,
				     u8 cmd, void *arg, unsigned arg_len,
				     void *resp, size_t resp_len,
				     unsigned long timeout_jiffies)
{
	struct message_send *msend;
	struct bounded_sync *bs;
	int ret;
	u64 id;

	bs = kzalloc(sizeof(*bs), GFP_NOFS);
	if (!bs)
		return -ENOMEM;
	init_completion(&bs->comp);
	bs->resp = resp;
	bs->resp_len = resp_len;
	bs->error = 0;
	atomic_set(&bs->refs, 2);

	ret = scoutfs_net_submit_request(sb, conn, cmd, arg, arg_len,
					 bounded_sync_response, bs, &id);
	if (ret) {
		bounded_sync_put(bs);
		bounded_sync_put(bs);
		return ret;
	}

	if (wait_for_completion_timeout(&bs->comp, timeout_jiffies) == 0) {
		scoutfs_inc_counter(sb, client_rpc_timeout);

		spin_lock(&conn->lock);
		msend = find_request(conn, cmd, id);
		if (msend)
			queue_dead_free(conn, msend);
		spin_unlock(&conn->lock);

		if (msend)
			bounded_sync_put(bs);
		else
			wait_for_completion(&bs->comp);
		ret = -ETIMEDOUT;
	} else {
		ret = bs->error;
	}

	bounded_sync_put(bs);
	return ret;
}
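For context, a sketch of how a caller consumes this bounded-wait variant. The srch compact worker further down in this diff does exactly this through the scoutfs_client_srch_get_compact_timeout() and scoutfs_client_srch_commit_compact_timeout() wrappers with COMPACT_RPC_TIMEOUT; the worker struct, workqueue, and command name in this sketch are hypothetical stand-ins:

	/* hypothetical idempotent worker; any error, including -ETIMEDOUT, just retries later */
	static void example_worker(struct work_struct *work)
	{
		struct example_info *inf = container_of(work, struct example_info, dwork.work);
		int ret;

		ret = scoutfs_net_sync_request_timeout(inf->sb, inf->conn, EXAMPLE_NET_CMD,
						       NULL, 0, &inf->result, sizeof(inf->result),
						       5 * 60 * HZ);
		if (ret < 0) {
			/* drop the request and reschedule instead of blocking forever */
			queue_delayed_work(inf->wq, &inf->dwork, 10 * HZ);
			return;
		}
		/* ... act on inf->result ... */
	}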

static void net_tseq_show_conn(struct seq_file *m,
			       struct scoutfs_tseq_entry *ent)
{

@@ -150,11 +150,6 @@ int scoutfs_net_sync_request(struct super_block *sb,
			     struct scoutfs_net_connection *conn,
			     u8 cmd, void *arg, unsigned arg_len,
			     void *resp, size_t resp_len);
int scoutfs_net_sync_request_timeout(struct super_block *sb,
				     struct scoutfs_net_connection *conn,
				     u8 cmd, void *arg, unsigned arg_len,
				     void *resp, size_t resp_len,
				     unsigned long timeout_jiffies);
int scoutfs_net_response(struct super_block *sb,
			 struct scoutfs_net_connection *conn,
			 u8 cmd, u64 id, int error, void *resp, u16 resp_len);

744	kmod/src/raw.c	Normal file
@@ -0,0 +1,744 @@
/*
 * Copyright (C) 2026 Versity Software, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/list_sort.h>
#include <linux/sort.h>

#include "format.h"
#include "key.h"
#include "block.h"
#include "inode.h"
#include "forest.h"
#include "client.h"
#include "ioctl.h"
#include "lock.h"
#include "xattr.h"
#include "attr_x.h"
#include "bsearch_index.h"
#include "raw.h"

struct fs_item {
	struct list_head head;
	struct scoutfs_key key;
	u64 seq;
	int val_len;
	bool deletion;
	/* val is aligned so we can deref structs in vals */
	u8 val[0] __aligned(ARCH_KMALLOC_MINALIGN);
};

static int save_fs_item(struct list_head *list, struct scoutfs_key *key, u64 seq, u8 flags,
			void *val, int val_len)
{
	struct fs_item *fsi;

	/* max btree val len is hundreds of bytes */
	fsi = kmalloc(offsetof(struct fs_item, val[val_len]), GFP_NOFS);
	if (!fsi)
		return -ENOMEM;

	fsi->key = *key;
	fsi->seq = seq;
	fsi->val_len = val_len;
	fsi->deletion = !!(flags & SCOUTFS_ITEM_FLAG_DELETION);
	if (val_len > 0)
		memcpy(fsi->val, val, val_len);
	list_add_tail(&fsi->head, list);

	return 0;
}

static void free_fs_item(struct fs_item *fsi)
{
	if (!list_empty(&fsi->head))
		list_del_init(&fsi->head);
	kfree(fsi);
}

static void free_fs_items(struct list_head *list)
{
	struct fs_item *fsi;
	struct fs_item *tmp;

	list_for_each_entry_safe(fsi, tmp, list, head)
		free_fs_item(fsi);
}

static struct fs_item *next_fs_item(struct list_head *list, struct fs_item *fsi)
{
	list_for_each_entry_continue(fsi, list, head)
		return fsi;
	return NULL;
}

static int cmp_fs_items(void *priv, KC_LIST_CMP_CONST struct list_head *A,
			KC_LIST_CMP_CONST struct list_head *B)
{
	KC_LIST_CMP_CONST struct fs_item *a =
		container_of(A, KC_LIST_CMP_CONST struct fs_item, head);
	KC_LIST_CMP_CONST struct fs_item *b =
		container_of(B, KC_LIST_CMP_CONST struct fs_item, head);

	return scoutfs_key_compare(&a->key, &b->key) ?: -scoutfs_cmp(a->seq, b->seq);
}

static void sort_and_remove(struct list_head *list, struct scoutfs_key *end)
{
	struct fs_item *prev;
	struct fs_item *fsi;
	struct fs_item *tmp;

	list_sort(NULL, list, cmp_fs_items);

	/* start by removing any items read before end was decreased by later blocks */
	list_for_each_entry_safe_reverse(fsi, tmp, list, head) {
		if (scoutfs_key_compare(&fsi->key, end) > 0)
			free_fs_item(fsi);
		else
			break;
	}

	prev = NULL;
	list_for_each_entry_safe(fsi, tmp, list, head) {
		/* remove this item if it's an older version of previous item */
		if (prev && scoutfs_key_compare(&prev->key, &fsi->key) == 0) {
			free_fs_item(fsi);
			continue;
		}

		/* remove previous deletion item once it has removed all older versions */
		if (prev && prev->deletion)
			free_fs_item(prev);

		/* next item might match this, record to compare */
		prev = fsi;
	}

	/* remove the last item if it's a deletion */
	list_for_each_entry_reverse(fsi, list, head) {
		if (fsi->deletion)
			free_fs_item(fsi);
		break;
	}
}
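A small worked example of the merge rule above, with made-up keys and sequence numbers:

	/*
	 * Sorted by key, then by descending seq (newest version first):
	 *
	 *   (K1, seq 7)  (K1, seq 3)  (K2, seq 9, deletion)  (K2, seq 5)  (K3, seq 2, deletion)
	 *
	 * (K1, 3) is dropped as an older version of (K1, 7).  (K2, 9) is a
	 * deletion, so it removes the older (K2, 5) and is then dropped
	 * itself.  (K3, 2) is a trailing deletion with nothing older to
	 * remove, so it's dropped too.  Only (K1, 7) survives.
	 */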

static int save_all_items(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags,
			  void *val, int val_len, int fic, void *arg)
{
	struct list_head *list = arg;

	return save_fs_item(list, key, seq, flags, val, val_len);
}

/* -------------- */

static void ms_from_key(struct scoutfs_ioctl_meta_seq *ms, struct scoutfs_key *key)
{
	ms->meta_seq = le64_to_cpu(key->skii_major);
	ms->ino = le64_to_cpu(key->skii_ino);
}

/*
 * Increment the key's ino->meta_seq so that we don't land between items.
 */
static void inc_meta_seq(struct scoutfs_key *key)
{
	le64_add_cpu(&key->skii_ino, 1);
	if (key->skii_ino == 0)
		le64_add_cpu(&key->skii_major, 1);
}

int scoutfs_raw_read_meta_seq(struct super_block *sb,
			      struct scoutfs_ioctl_raw_read_meta_seq *rms,
			      struct scoutfs_ioctl_meta_seq *last_ret)
{
	struct scoutfs_ioctl_meta_seq __user *ums;
	struct scoutfs_ioctl_meta_seq ms;
	struct scoutfs_net_roots roots;
	DECLARE_SAVED_REFS(saved);
	struct scoutfs_key start;
	struct scoutfs_key last;
	struct scoutfs_key key;
	struct scoutfs_key end;
	struct fs_item *fsi;
	struct fs_item *tmp;
	LIST_HEAD(list);
	int retries;
	int copied;
	int count;
	int ret;

	ums = (void __user *)rms->results_ptr;
	count = rms->results_size / sizeof(struct scoutfs_ioctl_meta_seq);
	retries = 10;
	copied = 0;

	scoutfs_inode_init_index_key(&last, SCOUTFS_INODE_INDEX_META_SEQ_TYPE,
				     rms->end.meta_seq, 0, rms->end.ino);

retry:
	ret = scoutfs_client_get_roots(sb, &roots);
	if (ret)
		goto out;

	scoutfs_inode_init_index_key(&key, SCOUTFS_INODE_INDEX_META_SEQ_TYPE,
				     rms->start.meta_seq, 0, rms->start.ino);

	for (;;) {
		start = key;
		end = last;
		ret = scoutfs_forest_read_items_roots(sb, &roots, 0, &key, NULL, &start, &end,
						      save_all_items, &list);
		if (ret < 0)
			goto out;

		sort_and_remove(&list, &end);

		list_for_each_entry_safe(fsi, tmp, &list, head) {

			if (copied == count) {
				/* results are full, set end to just before the item we can't return */
				end = fsi->key;
				le64_add_cpu(&end.skii_ino, -1ULL);
				ret = 0;
				goto out;
			}

			ms_from_key(&ms, &fsi->key);
			if (copy_to_user(&ums[copied], &ms, sizeof(ms))) {
				ret = -EFAULT;
				goto out;
			}

			free_fs_item(fsi);
			copied++;
		}

		if (scoutfs_key_compare(&end, &last) >= 0) {
			end = last;
			break;
		}

		key = end;
		inc_meta_seq(&key);
	}

	ret = 0;
out:
	free_fs_items(&list);

	ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
	if (ret == -ESTALE && copied == 0 && retries-- > 0)
		goto retry;

	ms_from_key(last_ret, &end);

	return ret ?: copied;
}

/* -------------- */

struct inode_info_context {
	size_t nr_inos;
	u64 *inos;

	size_t nr_names;
	struct xattr_name {
		u64 hash;
		char *name;
		u8 name_len; /* no null */
	} *names;

	struct list_head fs_items;
};

static int cmp_u64(const void *A, const void *B)
{
	const u64 *a = A;
	const u64 *b = B;

	return scoutfs_cmp(*a, *b);
}

static int cmp_name_hash(const void *A, const void *B)
{
	const struct xattr_name *a = A;
	const struct xattr_name *b = B;

	return scoutfs_cmp(a->hash, b->hash);
}

static int cmp_name_string(const void *A, const void *B)
{
	const struct xattr_name *a = A;
	const struct xattr_name *b = B;

	return scoutfs_cmp(a->name_len, b->name_len) ?: memcmp(a->name, b->name, a->name_len);
}

static int setup_context(struct inode_info_context *ctx,
			 struct scoutfs_ioctl_raw_read_inode_info *rii)
{
	__u64 __user *uinos = (void __user *)rii->inos_ptr;
	char __user *uname;
	long len_null;
	long len;
	int ret;
	u32 i;

	ctx->nr_inos = rii->inos_count;
	ctx->nr_names = rii->names_count;
	INIT_LIST_HEAD(&ctx->fs_items);

	ctx->inos = kvmalloc_array(ctx->nr_inos, sizeof(ctx->inos[0]), GFP_KERNEL);
	ctx->names = kvcalloc(ctx->nr_names, sizeof(ctx->names[0]), GFP_KERNEL);
	if (!ctx->inos || !ctx->names) {
		ret = -ENOMEM;
		goto out;
	}

	if (copy_from_user(ctx->inos, uinos, ctx->nr_inos * sizeof(ctx->inos[0]))) {
		ret = -EFAULT;
		goto out;
	}

	/* inos must not be 0 and must increase and contain no duplicates */
	if (ctx->inos[0] == 0) {
		ret = -EINVAL;
		goto out;
	}
	for (i = 1; i < ctx->nr_inos; i++) {
		if (ctx->inos[i] <= ctx->inos[i - 1]) {
			ret = -EINVAL;
			goto out;
		}
	}

	uname = (void __user *)rii->names_ptr;
	for (i = 0; i < ctx->nr_names; i++) {
		len_null = SCOUTFS_XATTR_MAX_NAME_LEN + 1;
		ret = strnlen_user(uname, len_null);
		if (ret <= 1 || ret > len_null) {
			if (ret >= 0)
				ret = -EINVAL;
			goto out;
		}
		len_null = ret;
		len = len_null - 1;

		ctx->names[i].name_len = len;
		ctx->names[i].name = kmalloc(len_null, GFP_KERNEL);
		if (!ctx->names[i].name) {
			ret = -ENOMEM;
			goto out;
		}

		ret = strncpy_from_user(ctx->names[i].name, uname, len_null);
		if (ret != len) {
			if (ret >= 0)
				ret = -EINVAL;
			goto out;
		}

		ctx->names[i].hash = scoutfs_xattr_name_hash(ctx->names[i].name, len);
		uname += len_null;
	}

	/* make sure all the names differ */
	sort(ctx->names, ctx->nr_names, sizeof(ctx->names[0]), cmp_name_string, NULL);
	for (i = 1; i < ctx->nr_names; i++) {
		if (cmp_name_string(&ctx->names[i - 1], &ctx->names[i]) == 0) {
			ret = -EINVAL;
			goto out;
		}
	}

	/* then leave them sorted by hash */
	sort(ctx->names, ctx->nr_names, sizeof(ctx->names[0]), cmp_name_hash, NULL);

	ret = 0;
out:
	return ret;
}

static void free_context(struct inode_info_context *ctx)
{
	int i;

	kvfree(ctx->inos);

	if (ctx->names) {
		for (i = 0; i < ctx->nr_names; i++) {
			if (!ctx->names[i].name)
				break;
			kfree(ctx->names[i].name);
		}
		kvfree(ctx->names);
	}
}

/*
 * Iterate over fs items and save any that we're interested in. We want
 * inode struct items and any xattr items whose hashes collide with the
 * xattr names we're searching for.
 *
 * Our forest calls can be advancing through the key space as we see
 * slices that intersect with blocks in trees. And each forest caller
 * can be resetting the key position to the start of each forest block
 * it reads in an intersection.
 *
 * From this callback's perspective, the key can be jumping all over the
 * place. We don't have any iterative position state. For each key we
 * decide if we want to save it and then set the key to the next key we
 * want after the current key. We'll combine all the saved keys later.
 */
static int save_info_items(struct super_block *sb, struct scoutfs_key *key, u64 seq,
			   u8 flags, void *val, int val_len, int fic, void *arg)
{
	u64 ino = le64_to_cpu(key->_sk_first);
	struct inode_info_context *ctx = arg;
	struct xattr_name name;
	size_t name_ind;
	size_t ino_ind;
	bool hash_match;
	bool ino_match;
	int ret;

	ino_ind = bsearch_index(&ino, ctx->inos, ctx->nr_inos, sizeof(ctx->inos[0]), cmp_u64);
	ino_match = ino_ind < ctx->nr_inos && ctx->inos[ino_ind] == ino;

	/* jump to the next ino, could be for this key if we're before the ino struct */
	if (!ino_match || key->sk_type < SCOUTFS_INODE_TYPE)
		goto next_inode;

	/* find our search position in xattrs */
	if (key->sk_type < SCOUTFS_XATTR_TYPE) {
		name_ind = 0;
		hash_match = false;

	} else if (key->sk_type == SCOUTFS_XATTR_TYPE) {
		name = (struct xattr_name) { .hash = le64_to_cpu(key->skx_name_hash) };
		name_ind = bsearch_index(&name, ctx->names, ctx->nr_names, sizeof(ctx->names[0]),
					 cmp_name_hash);
		hash_match = name_ind < ctx->nr_names && ctx->names[name_ind].hash == name.hash;
	} else {
		name_ind = ctx->nr_names;
		hash_match = false;
	}

	/* save inode items for our search and all xattr items that match search hashes */
	if (key->sk_type == SCOUTFS_INODE_TYPE || hash_match) {
		ret = save_fs_item(&ctx->fs_items, key, seq, flags, val, val_len);
		if (ret < 0)
			goto out;
	}

	/* let the caller continue iterating through matching xattr items */
	if (hash_match) {
		ret = 0;
		goto out;
	}

	/* jump to the next xattr */
	if (name_ind < ctx->nr_names) {
		scoutfs_xattr_init_key(key, ino, ctx->names[name_ind].hash, 0);
		ret = -ESRCH;
		goto out;
	}

	/* no more xattrs, must be done with this ino */
	ino_ind++;

next_inode:
	/* now jump to next inode struct key, or we're done */
	if (ino_ind < ctx->nr_inos)
		scoutfs_inode_init_key(key, ctx->inos[ino_ind]);
	else
		scoutfs_key_set_ones(key);

	ret = -ESRCH;
out:
	return ret;
}
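A toy illustration of the reposition contract this callback uses: returning -ESRCH after rewriting *key tells the forest reader to seek rather than step. This is a sketch, not scoutfs code:

	/* toy reader honoring the -ESRCH "seek to the rewritten key" contract;
	 * keys[] is assumed to be sorted ascending */
	static int toy_read_items(u64 *keys, int nr, int (*cb)(u64 *key, void *arg), void *arg)
	{
		u64 pos = 0;
		int ret = 0;
		int i;

		for (i = 0; i < nr; i++) {
			if (keys[i] < pos)
				continue;	/* the callback asked us to skip ahead */
			pos = keys[i];
			ret = cb(&pos, arg);
			if (ret == -ESRCH) {
				ret = 0;	/* not an error, just a new position */
				continue;
			}
			if (ret < 0)
				break;
		}
		return ret;
	}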

static int copy_to_user_off(void __user *dst, size_t *dst_off, size_t dst_size,
			    void *src, size_t copy_size)
{
	if (copy_size == 0)
		return 0;
	if (*dst_off + copy_size > dst_size)
		return -ERANGE;
	if (copy_to_user(dst + *dst_off, src, copy_size))
		return -EFAULT;

	*dst_off += copy_size;
	return 0;
}

static int copy_result_to_user(void __user *ures, size_t *off, size_t size, u8 type,
			       void *a_data, size_t a_len, void *b_data, size_t b_len,
			       size_t extra_size)
{
	struct scoutfs_ioctl_raw_read_result res;
	const size_t szof_res = sizeof(struct scoutfs_ioctl_raw_read_result);

	memzero_explicit(&res, szof_res);
	res = (struct scoutfs_ioctl_raw_read_result) {
		.size = a_len + b_len + extra_size,
		.type = type,
	};

	return copy_to_user_off(ures, off, size, &res, szof_res) ?:
	       (a_len ? copy_to_user_off(ures, off, size, a_data, a_len) : 0) ?:
	       (b_len ? copy_to_user_off(ures, off, size, b_data, b_len) : 0);
}
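Read off the copy calls in copy_item_results_to_user() below, the result stream these helpers build looks like this (sizes illustrative):

	/*
	 * results buffer layout, one header per copy_result_to_user() call:
	 *
	 *   [raw_read_result: type INODE, size] [__u64 ino] [struct scoutfs_inode]
	 *   [raw_read_result: type XATTR, size] [name bytes] ['\0'] [start of value]
	 *   [raw value continuation bytes from later xattr parts, no header]
	 */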

static int copy_item_results_to_user(struct super_block *sb, struct inode_info_context *ctx,
				     void __user *ures, size_t *off, size_t size,
				     struct fs_item *fsi)
{
	struct scoutfs_inode *cinode;
	struct scoutfs_xattr *xat;
	static char null = '\0';
	size_t len;
	u64 ino;
	int ret = 0;

	if (fsi->key.sk_type == SCOUTFS_INODE_TYPE) {
		cinode = (void *)fsi->val;
		ino = le64_to_cpu(fsi->key.ski_ino);

		ret = copy_result_to_user(ures, off, size, SCOUTFS_IOC_RAW_READ_RESULT_INODE,
					  &ino, sizeof(ino), cinode, sizeof(struct scoutfs_inode),
					  0);

	} else if (fsi->key.sk_type == SCOUTFS_XATTR_TYPE) {
		if (fsi->key.skx_part == 0) {
			xat = (void *)fsi->val;
			ret = copy_result_to_user(ures, off, size,
						  SCOUTFS_IOC_RAW_READ_RESULT_XATTR, xat->name,
						  xat->name_len, &null, sizeof(null),
						  le16_to_cpu(xat->val_len));
			if (ret == 0 && xat->val_len != 0) {
				/* then append the start of the value */
				len = fsi->val_len -
				      offsetof(struct scoutfs_xattr, name[xat->name_len]);
				ret = copy_to_user_off(ures, off, size, xat->name + xat->name_len,
						       len);
			}
		} else {
			/* continue appending partial values */
			ret = copy_to_user_off(ures, off, size, fsi->val, fsi->val_len);
		}
	}

	return ret;
}

static bool ignore_zero_nlink(struct inode_info_context *ctx, struct fs_item *fsi)
{
	struct scoutfs_inode *cinode = (void *)fsi->val;

	return cinode->nlink == 0;
}

static bool ignore_xattr_name(struct inode_info_context *ctx, struct fs_item *fsi)
{
	struct scoutfs_xattr *xat = (void *)fsi->val;
	struct xattr_name name = {
		.hash = le64_to_cpu(fsi->key.skx_name_hash),
		.name = xat->name,
		.name_len = xat->name_len,
	};
	size_t i;

	for (i = bsearch_index(&name, ctx->names, ctx->nr_names, sizeof(ctx->names[0]),
			       cmp_name_hash);
	     i < ctx->nr_names && name.hash == ctx->names[i].hash; i++) {
		if (cmp_name_string(&name, &ctx->names[i]) == 0)
			return false;
	}

	return true;
}

static int copy_results_to_user(struct super_block *sb, struct inode_info_context *ctx,
				struct scoutfs_ioctl_raw_read_inode_info *rii)
{
	void __user *ures = (void __user *)rii->results_ptr;
	struct scoutfs_xattr *xat;
	struct fs_item *next;
	struct fs_item *fsi;
	struct fs_item *tmp;
	size_t xattr_end;
	size_t off;
	__le64 in_ino;
	__le64 in_id;
	int ret;

	in_ino = 0;
	xattr_end = 0;
	in_id = 0;
	off = 0;

	list_for_each_entry_safe(fsi, tmp, &ctx->fs_items, head) {
		/*
		 * ignore:
		 *  - inodes with an nlink of 0
		 *  - all items for an ino after the inode struct that we're ignoring
		 *  - first xattr parts with a name we don't need
		 *  - additional xattr parts when we ignored the first
		 */
		if ((fsi->key.sk_type == SCOUTFS_INODE_TYPE && ignore_zero_nlink(ctx, fsi)) ||
		    (fsi->key.sk_type > SCOUTFS_INODE_TYPE && fsi->key._sk_first != in_ino) ||
		    (fsi->key.sk_type == SCOUTFS_XATTR_TYPE &&
		     ((fsi->key.skx_part == 0 && ignore_xattr_name(ctx, fsi)) ||
		      (fsi->key.skx_part > 0 && fsi->key.skx_id != in_id)))) {
			free_fs_item(fsi);
			in_ino = 0;
			in_id = 0;
			continue;
		}

		/* advance ino/xattr stream context state machine */
		if (fsi->key.sk_type == SCOUTFS_INODE_TYPE) {
			in_ino = fsi->key.ski_ino;
			in_id = 0;
		} else if (fsi->key.sk_type == SCOUTFS_XATTR_TYPE && fsi->key.skx_part == 0) {
			in_id = fsi->key.skx_id;
			/* save the required offset after the complete xattr */
			xat = (void *)fsi->val;
			xattr_end = off + sizeof(struct scoutfs_ioctl_raw_read_result) +
				    xat->name_len + 1 + le16_to_cpu(xat->val_len);
		}

		/* copy results, usually with header, but additional xattr parts copied raw */
		ret = copy_item_results_to_user(sb, ctx, ures, &off, rii->results_size, fsi);
		if (ret < 0)
			goto out;

		/* make sure we saw all xattr parts and copied the correct size */
		if (xattr_end > 0 &&
		    !((next = next_fs_item(&ctx->fs_items, fsi)) &&
		      next->key.sk_type == SCOUTFS_XATTR_TYPE && next->key.skx_ino == in_ino &&
		      next->key.skx_id == in_id)) {
			if (off != xattr_end) {
				ret = -EUCLEAN;
				goto out;
			}
			xattr_end = 0;
		}
	}

	ret = 0;
out:
	return ret ?: off;
}

/*
 * If the key is for an inode we're not interested in, or if it's past
 * the xattr items, then advance to the next inode. This is used
 * between forest read items calls to avoid reading leaf blocks we
 * don't need. The callback takes care of iterating through the items
 * for an inode across multiple leaves.
 */
static void advance_key_ino(struct scoutfs_key *key, struct inode_info_context *ctx)
{
	u64 ino = le64_to_cpu(key->_sk_first);
	size_t ino_ind;

	ino_ind = bsearch_index(&ino, ctx->inos, ctx->nr_inos, sizeof(ctx->inos[0]), cmp_u64);
	if (ino_ind < ctx->nr_inos && ctx->inos[ino_ind] == ino) {
		if (key->sk_type <= SCOUTFS_XATTR_TYPE)
			return;
		else
			ino_ind++;
	}

	if (ino_ind < ctx->nr_inos)
		scoutfs_inode_init_key(key, ctx->inos[ino_ind]);
	else
		scoutfs_key_set_ones(key);
}

int scoutfs_raw_read_inode_info(struct super_block *sb,
				struct scoutfs_ioctl_raw_read_inode_info *rii)
{
	struct inode_info_context ctx = {0, };
	struct scoutfs_net_roots roots;
	DECLARE_SAVED_REFS(saved);
	struct scoutfs_key lock_start;
	struct scoutfs_key lock_end;
	struct scoutfs_key start;
	struct scoutfs_key last;
	struct scoutfs_key key;
	struct scoutfs_key end;
	LIST_HEAD(list);
	int retries = 10;
	int ret;

	ret = setup_context(&ctx, rii);
	if (ret < 0)
		goto out;

	if (ctx.nr_names > 0)
		scoutfs_xattr_init_key(&last, ctx.inos[ctx.nr_inos - 1],
				       ctx.names[ctx.nr_names - 1].hash, U64_MAX);
	else
		scoutfs_inode_init_key(&last, ctx.inos[ctx.nr_inos - 1]);

retry:
	ret = scoutfs_client_get_roots(sb, &roots);
	if (ret)
		goto out;

	scoutfs_inode_init_key(&key, ctx.inos[0]);

	while (scoutfs_key_compare(&key, &last) <= 0) {
		scoutfs_lock_get_fs_item_range(le64_to_cpu(key._sk_first), &lock_start, &lock_end);

		start = key;
		end = last;
		if (scoutfs_key_compare(&lock_end, &end) < 0)
			end = lock_end;

		ret = scoutfs_forest_read_items_roots(sb, &roots, 0, &key, &lock_start,
						      &start, &end, save_info_items, &ctx);
		if (ret < 0)
			goto out;

		/* save each sorted batch, might have partial results for an inode */
		sort_and_remove(&ctx.fs_items, &end);
		list_splice_tail_init(&ctx.fs_items, &list);

		key = end;
		if (!scoutfs_key_is_ones(&key)) {
			scoutfs_key_inc(&key);
			advance_key_ino(&key, &ctx);
		}
	}

	list_splice_tail_init(&list, &ctx.fs_items);
	ret = copy_results_to_user(sb, &ctx, rii);
out:
	free_fs_items(&list);
	free_fs_items(&ctx.fs_items);

	ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
	if (ret == -ESTALE && retries-- > 0)
		goto retry;

	free_context(&ctx);
	return ret;
}

10	kmod/src/raw.h	Normal file
@@ -0,0 +1,10 @@
#ifndef _SCOUTFS_RAW_H_
#define _SCOUTFS_RAW_H_

int scoutfs_raw_read_meta_seq(struct super_block *sb,
			      struct scoutfs_ioctl_raw_read_meta_seq *rms,
			      struct scoutfs_ioctl_meta_seq *last_ret);
int scoutfs_raw_read_inode_info(struct super_block *sb,
				struct scoutfs_ioctl_raw_read_inode_info *rii);

#endif

@@ -638,7 +638,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
	ret = scoutfs_alloc_empty_list(sb, &server->alloc, &server->wri,
				       server->meta_freed,
				       server->other_freed);
	if (ret && ret != -ENOLINK) {
	if (ret) {
		scoutfs_err(sb, "server error emptying freed: %d", ret);
		goto out;
	}

@@ -95,13 +95,6 @@ struct srch_info {
 */
#define SRCH_COMPACT_DIRTY_LIMIT_BYTES (32 * 1024 * 1024)

/*
 * Generous per-RPC bound for the idempotent compact worker. A server
 * that hasn't answered in this long is assumed to be broken; dropping
 * the request lets the worker reschedule instead of blocking forever.
 */
#define COMPACT_RPC_TIMEOUT (5 * 60 * HZ)

static int sre_cmp(const struct scoutfs_srch_entry *a,
		   const struct scoutfs_srch_entry *b)
{
@@ -2263,8 +2256,7 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)

	scoutfs_block_writer_init(sb, &wri);

	ret = scoutfs_client_srch_get_compact_timeout(sb, sc,
						      COMPACT_RPC_TIMEOUT);
	ret = scoutfs_client_srch_get_compact(sb, sc);
	if (ret >= 0)
		trace_scoutfs_srch_compact_client_recv(sb, sc);
	if (ret < 0 || sc->nr == 0)
@@ -2295,8 +2287,7 @@ static void scoutfs_srch_compact_worker(struct work_struct *work)
	sc->flags |= ret < 0 ? SCOUTFS_SRCH_COMPACT_FLAG_ERROR : 0;

	trace_scoutfs_srch_compact_client_send(sb, sc);
	err = scoutfs_client_srch_commit_compact_timeout(sb, sc,
							 COMPACT_RPC_TIMEOUT);
	err = scoutfs_client_srch_commit_compact(sb, sc);
	if (err < 0 && ret == 0)
		ret = err;
out:

@@ -195,8 +195,7 @@ static int retry_forever(struct super_block *sb, int (*func)(struct super_block
		retrying = true;
	}

	if (scoutfs_forcing_unmount(sb) ||
	    scoutfs_unmounting(sb)) {
	if (scoutfs_forcing_unmount(sb)) {
		ret = -ENOLINK;
		break;
	}

@@ -47,7 +47,7 @@
 * - add acl support and call generic xattr->handlers for SYSTEM
 */

static u32 xattr_name_hash(const char *name, unsigned int name_len)
u32 scoutfs_xattr_name_hash(const char *name, unsigned int name_len)
{
	return crc32c(U32_MAX, name, name_len);
}
@@ -65,8 +65,7 @@ static unsigned int xattr_nr_parts(struct scoutfs_xattr *xat)
				    le16_to_cpu(xat->val_len));
}

static void init_xattr_key(struct scoutfs_key *key, u64 ino, u32 name_hash,
			   u64 id)
void scoutfs_xattr_init_key(struct scoutfs_key *key, u64 ino, u32 name_hash, u64 id)
{
	*key = (struct scoutfs_key) {
		.sk_zone = SCOUTFS_FS_ZONE,
@@ -187,10 +186,10 @@ static int get_next_xattr(struct inode *inode, struct scoutfs_key *key,
		return -EINVAL;

	if (name_len)
		name_hash = xattr_name_hash(name, name_len);
		name_hash = scoutfs_xattr_name_hash(name, name_len);

	init_xattr_key(key, scoutfs_ino(inode), name_hash, id);
	init_xattr_key(&last, scoutfs_ino(inode), U32_MAX, U64_MAX);
	scoutfs_xattr_init_key(key, scoutfs_ino(inode), name_hash, id);
	scoutfs_xattr_init_key(&last, scoutfs_ino(inode), U32_MAX, U64_MAX);

	for (;;) {
		ret = scoutfs_item_next(sb, key, &last, xat, xat_bytes, lock);
@@ -335,8 +334,8 @@ static int create_xattr_items(struct inode *inode, u64 id, struct scoutfs_xattr
	int len;
	int i;

	init_xattr_key(&key, scoutfs_ino(inode),
		       xattr_name_hash(xat->name, xat->name_len), id);
	scoutfs_xattr_init_key(&key, scoutfs_ino(inode),
			       scoutfs_xattr_name_hash(xat->name, xat->name_len), id);

	for (i = 0; i < new_parts; i++) {
		key.skx_part = i;
@@ -365,7 +364,7 @@ static int delete_xattr_items(struct inode *inode, u32 name_hash, u64 id,
	int ret = 0;
	int i;

	init_xattr_key(&key, scoutfs_ino(inode), name_hash, id);
	scoutfs_xattr_init_key(&key, scoutfs_ino(inode), name_hash, id);

	/* dirty additional existing old items */
	for (i = 1; i < nr_parts; i++) {
@@ -407,8 +406,8 @@ static int change_xattr_items(struct inode *inode, u64 id,
	int i;
	int ret;

	init_xattr_key(&key, scoutfs_ino(inode),
		       xattr_name_hash(xat->name, xat->name_len), id);
	scoutfs_xattr_init_key(&key, scoutfs_ino(inode),
			       scoutfs_xattr_name_hash(xat->name, xat->name_len), id);

	/* dirty existing old items */
	for (i = 0; i < old_parts; i++) {
@@ -1224,8 +1223,8 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino,
		goto out;
	}

	init_xattr_key(&key, ino, 0, 0);
	init_xattr_key(&last, ino, U32_MAX, U64_MAX);
	scoutfs_xattr_init_key(&key, ino, 0, 0);
	scoutfs_xattr_init_key(&last, ino, U32_MAX, U64_MAX);

	for (;;) {
		ret = scoutfs_item_next(sb, &key, &last, (void *)xat, bytes,

@@ -10,6 +10,9 @@ struct scoutfs_xattr_prefix_tags {

extern const struct xattr_handler *scoutfs_xattr_handlers[];

u32 scoutfs_xattr_name_hash(const char *name, unsigned int name_len);
void scoutfs_xattr_init_key(struct scoutfs_key *key, u64 ino, u32 name_hash, u64 id);

int scoutfs_xattr_get_locked(struct inode *inode, const char *name, void *buffer, size_t size,
			     struct scoutfs_lock *lck);
int scoutfs_xattr_set_locked(struct inode *inode, const char *name, size_t name_len,

1	tests/.gitignore	vendored
@@ -12,3 +12,4 @@ src/o_tmpfile_umask
src/o_tmpfile_linkat
src/mmap_stress
src/mmap_validate
src/watch_raw_inode_change

@@ -15,7 +15,8 @@ BIN := src/createmany \
	src/o_tmpfile_umask \
	src/o_tmpfile_linkat \
	src/mmap_stress \
	src/mmap_validate
	src/mmap_validate \
	src/watch_raw_inode_change

DEPS := $(wildcard src/*.d)

664	tests/src/watch_raw_inode_change.c	Normal file
@@ -0,0 +1,664 @@
/*
 * Copyright (C) 2026 Versity Software, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <time.h>
#include <linux/types.h>
#include <assert.h>
#include <stdbool.h>

#include "../../utils/src/util.h"
#include "ioctl.h"
#include "format.h"

/*
 * This is a quick example of using the raw reading ioctls to get info
 * on inodes as they change. We maintain an array of meta_seq items for
 * inodes that we've seen. If we read the current meta_seq items and
 * see differences then we get inode info and update our array with what
 * we find.
 *
 * This only maintains one array and sorts it back and forth as we walk
 * the meta_seq items and then search by inode number. This will
 * eventually use far too much cpu as the number of inodes increases.
 */

#define MSF "%llu.%llu"
#define MSA(ms) (ms)->meta_seq, (ms)->ino
#define NERRF "nerr %d (\"%s\")"
#define NERRA(nerr) nerr, strerror(-nerr)

#define prerror(fmt, args...) \
	fprintf(stderr, "error: "fmt"\n", ##args)

#define prdebug(fmt, args...) \
do { \
	if (opts.debug) \
		printf(fmt"\n", ##args); \
} while (0)

static struct opts {
	bool debug;
	char *path;
	char *names;
	size_t names_size;
	size_t names_count;
} opts;

struct stats {
	__u64 start;
	__u64 last;

	struct per_call {
		__u64 begin;
		__u64 calls;
		__u64 time;
		__u64 inos;
	} rms, rii;

	__u64 inodes;
	__u64 add;
	__u64 remove;
	__u64 update;

	unsigned lines;
} stats;

struct meta_seq_array {
	size_t nr;
	size_t alloc;
	struct scoutfs_ioctl_meta_seq *ms;
};

#define INO_BATCH 1000
/* *2 for gratuitous allowance for struct expansion */
#define RESULTS_SIZE (INO_BATCH * 2 * (sizeof(struct scoutfs_ioctl_raw_read_result) + \
				       sizeof(__u64) + \
				       180 /* ~= sizeof(struct scoutfs_inode) */ + \
				       sizeof(struct scoutfs_ioctl_inode_attr_x)))

#define NSEC_PER_SEC 1000000000

static __u64 get_ns(void)
{
	struct timespec tp;
	int ret;

	ret = clock_gettime(CLOCK_MONOTONIC, &tp);
	if (ret != 0) {
		ret = -errno;
		prerror("clock_gettime() error: "NERRF, NERRA(ret));
		exit(2);
	}

	return ((__u64)tp.tv_sec * NSEC_PER_SEC) + (__u64)tp.tv_nsec;
}

static void begin_call(struct per_call *pc)
{
	pc->begin = get_ns();
}

static void end_call(struct per_call *pc)
{
	pc->calls++;
	pc->time += get_ns() - pc->begin;
}

static int expand_array(struct meta_seq_array *arr, size_t additional)
{
#define ALLOC_BATCH (1024 * 1024 / (sizeof(struct scoutfs_ioctl_meta_seq)))
	struct scoutfs_ioctl_meta_seq *ms;
	size_t expand;

	if (arr->nr + additional <= arr->alloc)
		return 0;

	expand = arr->alloc + ALLOC_BATCH;
	ms = reallocarray(arr->ms, expand, sizeof(arr->ms[0]));
	if (!ms) {
		prerror("allocating ms array with %zu elements failed", expand);
		return -ENOMEM;
	}

	arr->alloc = expand;
	arr->ms = ms;

	return 0;
}

static void inc_ms(struct scoutfs_ioctl_meta_seq *ms)
{
	if (++ms->ino == 0)
		ms->meta_seq++;
}

static void set_ms(struct scoutfs_ioctl_meta_seq *ms, __u64 meta_seq, __u64 ino)
{
	ms->meta_seq = meta_seq;
	ms->ino = ino;
}

static int compar_ms_ino(const void *A, const void *B)
{
	const struct scoutfs_ioctl_meta_seq *a = A;
	const struct scoutfs_ioctl_meta_seq *b = B;

	return a->ino < b->ino ? -1 : a->ino > b->ino ? 1 : 0;
}

static int compar_ms_meta_seq(const void *A, const void *B)
{
	const struct scoutfs_ioctl_meta_seq *a = A;
	const struct scoutfs_ioctl_meta_seq *b = B;

	return a->meta_seq < b->meta_seq ? -1 : a->meta_seq > b->meta_seq ? 1 :
	       compar_ms_ino(A, B);
}

static int compar_u64(const void *A, const void *B)
{
	const __u64 *a = A;
	const __u64 *b = B;

	return *a < *b ? -1 : *a > *b ? 1 : 0;
}

struct bsearch_ind_key {
	int (*compar)(const void *a, const void *b);
	void *key;
	size_t size;
	void **index;
};

static int bsearch_ind_compar(const void *a, const void *b)
{
	const struct bsearch_ind_key *bik = (const void *)((unsigned long)a ^ 1);
	int cmp;

	/* this key hack only works if compar is always called where a is key and b is &base[..] */
	assert((unsigned long)a & 1);
	assert(!((unsigned long)b & 1));

	cmp = bik->compar(bik->key, b);
	if (cmp > 0)
		*(bik->index) = (void *)b + bik->size;
	else
		*(bik->index) = (void *)b;

	return cmp;
}

static size_t bsearch_ind(const void *key, const void *base, size_t nmemb, size_t size,
			  int (*compar)(const void *a, const void *b))
{
	void *index = (void *)base;
	struct bsearch_ind_key bik = {
		.compar = compar,
		.key = (void *)key,
		.size = size,
		.index = &index,
	};

	bsearch((void *)(((unsigned long)&bik) | 1), base, nmemb, size, bsearch_ind_compar);

	return (index - base) / size;
}
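bsearch_ind() bends libc bsearch() into returning an index instead of a pointer-or-NULL: every comparison records where the key would land relative to the probed element, so when the search finishes the recorded position is the matching element or, for a missing key, its insertion point. The low pointer bit only marks the smuggled key context so the asserts can catch a qsort-style implementation that compares two array elements directly. A quick sanity check of the semantics (values made up, unique keys as used in this program):

	static void bsearch_ind_example(void)
	{
		__u64 vals[] = { 10, 20, 30 };
		__u64 key = 25;

		/* missing key: index of the first element greater than it */
		assert(bsearch_ind(&key, vals, 3, sizeof(vals[0]), compar_u64) == 2);

		/* present key: index of the matching element */
		key = 20;
		assert(bsearch_ind(&key, vals, 3, sizeof(vals[0]), compar_u64) == 1);
	}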

/*
 * Generate a sorted list of inode numbers for the meta_seq items that
 * differ between the results from raw_read_meta_seq and the items we
 * have saved in our array.
 */
static int differing_inos(__u64 *inos, struct meta_seq_array *arr,
			  struct scoutfs_ioctl_meta_seq *start,
			  struct scoutfs_ioctl_meta_seq *last,
			  struct scoutfs_ioctl_meta_seq *ms, size_t nr)
{
	size_t arr_last;
	size_t a;
	size_t m;
	int nr_inos;
	int cmp;
	int i;
	int n;

	/* find where we're going to stop in arr */
	arr_last = bsearch_ind(last, arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_meta_seq);
	if (arr_last < arr->nr && compar_ms_meta_seq(&arr->ms[arr_last], last) == 0)
		arr_last++;

	a = bsearch_ind(start, arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_meta_seq);

	for (m = 0, nr_inos = 0; (a < arr_last || m < nr) && nr_inos < INO_BATCH; ) {

		prdebug("diffing: m %zu nr %zu | a %zu arr_last %zu | nr_inos %d",
			m, nr, a, arr_last, nr_inos);
		if (a < arr_last)
			prdebug("  arr->ms[%zu] = "MSF, a, MSA(&arr->ms[a]));
		if (m < nr)
			prdebug("  ms[%zu] = "MSF, m, MSA(&ms[m]));

		/* setup comparison to copy lesser or only */
		if (a < arr_last && m < nr)
			cmp = compar_ms_meta_seq(&arr->ms[a], &ms[m]);
		else if (a < arr_last)
			cmp = -1;
		else
			cmp = 1;

		prdebug("  cmp %d", cmp);

		if (cmp == 0) {
			/* ignore both when they match */
			a++;
			m++;
		} else if (cmp < 0) {
			inos[nr_inos++] = arr->ms[a++].ino;
		} else { /* cmp > 0 */
			inos[nr_inos++] = ms[m++].ino;
		}
	}

	/* if we didn't consume all the read meta_seq then we might need to clamp last */
	if (m < nr && compar_ms_meta_seq(&ms[m], last) <= 0) {
		*last = ms[m];
		last->ino--; /* must be non-zero, can't wrap */
	}

	/* sort and remove duplicate inode numbers */
	if (nr_inos > 0) {
		qsort(inos, nr_inos, sizeof(inos[0]), compar_u64);
		for (i = 1, n = 1; i < nr_inos; i++) {
			if (inos[i] != inos[n - 1])
				inos[n++] = inos[i];
		}
		nr_inos = n;
	}

	return nr_inos;
}
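A made-up example of the two-way walk above, both lists sorted by (meta_seq, ino):

	/*
	 *   arr:  (seq 5, ino 2)  (seq 6, ino 4)
	 *   read: (seq 5, ino 2)  (seq 7, ino 4)  (seq 8, ino 9)
	 *
	 * (5,2) matches on both sides and is skipped.  (6,4) exists only in
	 * arr and (7,4) only in the read results, so ino 4 is emitted from
	 * each side and then deduplicated.  (8,9) exists only in the read
	 * results, so ino 9 is emitted.  The result is inos = { 4, 9 }.
	 */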

/*
 * We're not really validating the result stream. We assume that the offset currently
 * points at an inode. We fill the caller's ms with its info then iterate through
 * all its results until the next ino.
 */
static ssize_t read_inode_results(void *buf, size_t off, size_t size,
				  struct scoutfs_ioctl_meta_seq *found)
{
	struct scoutfs_ioctl_raw_read_result res;
	size_t len;
	__le64 ms;

	found->ino = 0;

	while (off < size) {
		memcpy(&res, buf + off, sizeof(res));
		prdebug("res %u %u", res.type, res.size);

		if (res.type == SCOUTFS_IOC_RAW_READ_RESULT_INODE && found->ino != 0)
			break;

		off += sizeof(res);

		switch(res.type) {
		case SCOUTFS_IOC_RAW_READ_RESULT_INODE:
			memcpy(&found->ino, buf + off, sizeof(__u64));
			memcpy(&ms, buf + off + sizeof(__u64) +
			       offsetof(struct scoutfs_inode, meta_seq), sizeof(__le64));
			found->meta_seq = le64_to_cpu(ms);
			prdebug("res ino %llu ms %llu", found->ino, found->meta_seq);
			break;

		case SCOUTFS_IOC_RAW_READ_RESULT_XATTR:
			len = strlen((char *)buf + off) + 1;
			prdebug("res xattr '%s' len %d: '%.*s'",
				(char *)buf + off,
				(int)(res.size - len),
				(int)(res.size - len),
				(char *)buf + off + len);
			break;
		};
		off += res.size;
	}

	return off;
}

/*
 * inos[] contains the inode numbers that we're interested in. Get
 * their info and update our array with what we find.
 */
static int read_inode_info(int fd, void *buf, struct meta_seq_array *arr, __u64 *inos, int nr_inos)
{
	struct scoutfs_ioctl_raw_read_inode_info rii;
	struct scoutfs_ioctl_meta_seq found;
	struct scoutfs_ioctl_meta_seq ms;
	ssize_t off;
	size_t size;
	size_t ind;
	size_t added;
	int i;
	int ret;

	rii = (struct scoutfs_ioctl_raw_read_inode_info) {
		.inos_ptr = (unsigned long)inos,
		.inos_count = nr_inos,
		.names_ptr = (unsigned long)opts.names,
		.names_count = opts.names_count,
		.results_ptr = (unsigned long)buf,
		.results_size = RESULTS_SIZE,
	};

	begin_call(&stats.rii);
	ret = ioctl(fd, SCOUTFS_IOC_RAW_READ_INODE_INFO, &rii);
	if (ret < 0) {
		ret = -errno;
		prerror("READ_INODE_INFO ioctl failed: "NERRF, NERRA(ret));
		goto out;
	}
	end_call(&stats.rii);

	prdebug("gii ret %d", ret);

	off = 0;
	size = ret;
	set_ms(&found, 0, 0);
	added = 0;
	i = 0;

	/* sort by ino so we can search by ino for updates */
	qsort(arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_ino);

	while (i < nr_inos) {
		/* find next ino */
		if (!found.ino && off < size) {
			off = read_inode_results(buf, off, size, &found);
			if (off < 0) {
				ret = off;
				goto out;
			}
			stats.rii.inos++;
		}

		if (i < nr_inos && (!found.ino || inos[i] < found.ino)) {
			/* delete any record of inodes we didn't find */
			set_ms(&ms, UINT64_MAX, inos[i]);
			i++;

		} else if (found.ino) {
			/* update/add arr to match the found ino */
			ms = found;
			if (i < nr_inos && inos[i] == found.ino)
				i++;
			set_ms(&found, 0, 0);
		}

		/* find existing record */
		ind = bsearch_ind(&ms, arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_ino);
		if (ind < arr->nr && arr->ms[ind].ino == ms.ino) {
			/* update existing ino, can be marking for deletion */
			prdebug("updating arr [%zu] ino %llu ms %llu -> %llu",
				ind, ms.ino, arr->ms[ind].meta_seq, ms.meta_seq);
			arr->ms[ind].meta_seq = ms.meta_seq;
			if (ms.meta_seq == UINT64_MAX)
				stats.remove++;
			else
				stats.update++;

		} else if (ms.meta_seq != UINT64_MAX) {
			/* append new found, maintaining existing sorting */
			arr->ms[arr->nr + added] = ms;
			prdebug("adding arr [%zu] ino %llu ms %llu",
				arr->nr + added, ms.ino, ms.meta_seq);
			added++;
			stats.add++;
		}
	}

	/* sort by seq again for next meta seq read */
	arr->nr += added;
	qsort(arr->ms, arr->nr, sizeof(arr->ms[0]), compar_ms_meta_seq);

	/* and trim off any deletions */
	while (arr->nr > 0 && arr->ms[arr->nr - 1].meta_seq == UINT64_MAX)
		arr->nr--;

	ret = 0;
out:
	return ret;
}

static double secs(u64 a_ns, u64 b_ns)
{
	return (double)(a_ns - b_ns) / NSEC_PER_SEC;
}

static double nr_per_sec(u64 nr, __u64 nsec)
{
	if (nsec == 0)
		return 0;

	return (double)nr / secs(nsec, 0);
}

static void print_stats(void)
{
	u64 now = get_ns();

	if (secs(now, stats.last) < 1.0)
		return;

	if ((stats.lines++ % 16) == 0) {
		printf("%6s | %-29s | %-23s | %-23s\n",
		       "", "inodes", "meta_seq", "inode_info");
		printf("%6s | %8s %6s %6s %6s | %7s %7s %7s | %7s %7s %7s\n",
		       "now",
		       "total", "add", "remove", "update",
		       "calls", "inos", "inos/s",
		       "calls", "inos", "inos/s");
	}

	printf("%6.3lf | %8llu %6llu %6llu %6llu | %7llu %7llu %7.0lf | %7llu %7llu %7.0lf\n",
	       secs(now, stats.start),
	       stats.inodes, stats.add, stats.remove, stats.update,
	       stats.rms.calls, stats.rms.inos, nr_per_sec(stats.rms.inos, stats.rms.time),
	       stats.rii.calls, stats.rii.inos, nr_per_sec(stats.rii.inos, stats.rii.time));

	stats.last = now;

	{
		struct stats save = stats;
		stats = (struct stats) {
			.start = save.start,
			.last = save.last,
			.lines = save.lines,
		};
	}
}

static void add_xattr(char *name)
{
	size_t len_null;
	char *names;
	int ret;

	len_null = strlen(name) + 1;
	names = realloc(opts.names, opts.names_size + len_null);
	if (!names) {
		ret = -errno;
		prerror("allocation of xattr names buffer failed: "NERRF, NERRA(ret));
		exit(3);
	}

	memcpy(names + opts.names_size, name, len_null);

	opts.names = names;
	opts.names_size += len_null;
	opts.names_count++;
}

static bool parse_opts(int argc, char **argv)
{
	bool usage = false;
	int c;

	opts = (struct opts) {
		.debug = false,
	};

	while ((c = getopt(argc, argv, "dp:x:")) != -1) {
		switch(c) {
		case 'd':
			opts.debug = true;
			break;
		case 'p':
			opts.path = strdup(optarg);
			break;
		case 'x':
			add_xattr(optarg);
			break;
		case '?':
			printf("Unknown option '%c'\n", optopt);
			usage = true;
		}
	}

	if (!usage) {
		usage = true;
		if (!opts.path)
			printf("need -p path option\n");
		else
			usage = false;
	}

	if (usage) {
		printf("\nusage:\n"
		       "  -d      | enable verbose debugging output\n"
		       "  -p PATH | path to file system to watch\n"
		       "  -x NAME | try to read named xattr with inodes, can be many\n"
		       );
		return false;
	}

	return true;
}

int main(int argc, char **argv)
{
	struct scoutfs_ioctl_raw_read_meta_seq rms = {0,};
	struct scoutfs_ioctl_meta_seq *ms;
	struct meta_seq_array arr = {0,};
	__u64 *inos = NULL;
	void *buf = NULL;
	int fd = -1;
	int nr_inos;
	int nr;
	int i;
	int ret;

	if (!parse_opts(argc, argv))
		exit(1);

	inos = calloc(INO_BATCH, sizeof(inos[0]));
	buf = malloc(RESULTS_SIZE);
	if (!inos || !buf) {
		ret = -ENOMEM;
		goto out;
	}

	rms.results_ptr = (unsigned long)buf;
	rms.results_size = min(RESULTS_SIZE, INO_BATCH * sizeof(struct scoutfs_ioctl_meta_seq));

	fd = open(opts.path, O_RDONLY);
	if (fd == -1) {
		perror("error");
		exit(1);
	}

	stats.start = get_ns();

	for (;;) {
		set_ms(&rms.start, 0, 0);
		set_ms(&rms.end, UINT64_MAX, UINT64_MAX);

		do {
			begin_call(&stats.rms);
			ret = ioctl(fd, SCOUTFS_IOC_RAW_READ_META_SEQ, &rms);
			if (ret < 0) {
				ret = -errno;
				prerror("READ_META_SEQ ioctl failed, "
					"start "MSF" end "MSF", "NERRF,
					MSA(&rms.start), MSA(&rms.end), NERRA(ret));
				goto out;
			}
			end_call(&stats.rms);
			stats.rms.inos += ret;

			prdebug("RMS last "MSF" ret %d:", MSA(&rms.last), ret);

			nr = ret;
			ms = buf;

			if (opts.debug && nr > 0) {
				for (i = 0; i < nr; i++)
					prdebug("  [%u] "MSF"", i, MSA(&ms[i]));
			}

			nr_inos = differing_inos(inos, &arr, &rms.start, &rms.last, ms, nr);

			if (nr_inos > 0) {
				prdebug("diff inos %d:", nr_inos);
				for (i = 0; i < nr_inos; i++)
					prdebug("  [%u] %llu", i, inos[i]);

				ret = expand_array(&arr, nr_inos) ?:
				      read_inode_info(fd, buf, &arr, inos, nr_inos);
				if (ret < 0)
					goto out;
			}

			stats.inodes = arr.nr;
			print_stats();

			rms.start = rms.last;
			inc_ms(&rms.start);

		} while (rms.last.meta_seq != UINT64_MAX || rms.last.ino != UINT64_MAX);

		sleep(1);
	}

	ret = 0;
out:
	if (fd >= 0)
		close(fd);

	free(inos);
	free(buf);
	free(arr.ms);
	free(opts.names);

	return ret;
}