mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-09 13:23:14 +00:00
Compare commits
80 Commits
v1.10
...
zab/hold_c
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4784ccdfd5 | ||
|
|
778c2769df | ||
|
|
9e3529060e | ||
|
|
1672b3ecec | ||
|
|
55f9435fad | ||
|
|
072f6868d3 | ||
|
|
8a64b46a2f | ||
|
|
14901c39aa | ||
|
|
e095127ae9 | ||
|
|
a9da27444f | ||
|
|
49fe89741d | ||
|
|
847916860d | ||
|
|
564b942ead | ||
|
|
3d99fda0f6 | ||
|
|
6c0ab75477 | ||
|
|
89b238a5c4 | ||
|
|
05371b83f0 | ||
|
|
acafb869e7 | ||
|
|
74c5fe1115 | ||
|
|
2279e9657f | ||
|
|
707752a7bf | ||
|
|
0316c22026 | ||
|
|
5a1e5639c2 | ||
|
|
950963375b | ||
|
|
e52435b993 | ||
|
|
2b72c57cb0 | ||
|
|
9c67b2a42d | ||
|
|
0b38aeb5a4 | ||
|
|
2daf873983 | ||
|
|
904c5dce90 | ||
|
|
57c6d78df8 | ||
|
|
74e9d0f764 | ||
|
|
98eb0eb649 | ||
|
|
15de0c21c1 | ||
|
|
7b65767803 | ||
|
|
46640e4ff9 | ||
|
|
912906f050 | ||
|
|
ec02cf442b | ||
|
|
0e9cd1eea5 | ||
|
|
e18ea24561 | ||
|
|
723309ff75 | ||
|
|
9bfad7d324 | ||
|
|
448e0abacb | ||
|
|
2a6d827e7a | ||
|
|
e7bd1b45dc | ||
|
|
6ded240089 | ||
|
|
99a20bc383 | ||
|
|
18903ce500 | ||
|
|
b76e22ffcf | ||
|
|
d6863d6832 | ||
|
|
bb01a3990f | ||
|
|
409631ceb1 | ||
|
|
f1264c7e47 | ||
|
|
a61b8d9961 | ||
|
|
eac57a1f7a | ||
|
|
5512d5c03e | ||
|
|
8cf7be4651 | ||
|
|
3363b4fb79 | ||
|
|
ddb5cce2a5 | ||
|
|
1b0e9c45f4 | ||
|
|
2e2ccb6f61 | ||
|
|
01c8bba56d | ||
|
|
17cb1fe84b | ||
|
|
78ae87031b | ||
|
|
bf93ea73c4 | ||
|
|
a23e7478a0 | ||
|
|
9ba2ee5c88 | ||
|
|
fe33a492c2 | ||
|
|
77c0ff89fb | ||
|
|
7c2d83e2f8 | ||
|
|
40aa47c888 | ||
|
|
c1bd7bcce5 | ||
|
|
7720222588 | ||
|
|
fff07ce19c | ||
|
|
464de56d28 | ||
|
|
342c206550 | ||
|
|
fe4734d019 | ||
|
|
b1a43bb312 | ||
|
|
929703213f | ||
|
|
78279ffb4a |
@@ -1,6 +1,90 @@
|
||||
Versity ScoutFS Release Notes
|
||||
=============================
|
||||
|
||||
---
|
||||
v1.15
|
||||
\
|
||||
*Jul 17, 2023*
|
||||
|
||||
Process log btree merge splicing in multiple commits. This prevents a
|
||||
rare case where pending log merge completions contain more work than can
|
||||
be done in a single server commit, causing the server to trigger an
|
||||
assert shortly after starting.
|
||||
|
||||
Fix spurious EINVAL from data writes when data\_prealloc\_contig\_only was
|
||||
set to 0.
|
||||
|
||||
---
|
||||
v1.14
|
||||
\
|
||||
*Jun 29, 2023*
|
||||
|
||||
Add get\_referring\_entries ioctl for getting directory entries that
|
||||
refer to an inode.
|
||||
|
||||
Fix excessive CPU use in the move\_blocks interface when moving a large
|
||||
number of extents.
|
||||
|
||||
Reduce fragmented data allocation when contig\_only prealloc is not in
|
||||
use by more consistently allocating multi-block extents within each
|
||||
aligned prealloc region.
|
||||
|
||||
Avoid rare deadlock in metadata block cache reclaim under both heavy
|
||||
load and memory pressure.
|
||||
|
||||
Fix crash when using quorum\_heartbeat\_timeout\_ms mount option.
|
||||
|
||||
---
|
||||
v1.13
|
||||
\
|
||||
*May 19, 2023*
|
||||
|
||||
Add the quorum\_heartbeat\_timeout\_ms mount option to set the quorum
|
||||
heartbeat timeout.
|
||||
|
||||
Change some task prioritization and allocation behavior of the quorum
|
||||
agent to help reduce delays in sending and receiving heartbeat messages.
|
||||
|
||||
---
|
||||
v1.12
|
||||
\
|
||||
*Apr 17, 2023*
|
||||
|
||||
Add the prepare-empty-data-device scoutfs command. A data device can be
|
||||
unused when no files have data blocks, perhaps because they're archived
|
||||
and offline. In this case the data device can be swapped out for
|
||||
another device without changes to the metadata device.
|
||||
|
||||
Fix an oversight which limited inode timestamps to second granularity
|
||||
for some operations. All operations now record timestamps with full
|
||||
nanosecond precision.
|
||||
|
||||
Fix spurious ENOENT failures when renaming from other directories into
|
||||
the root directory.
|
||||
|
||||
---
|
||||
v1.11
|
||||
\
|
||||
*Feb 2, 2023*
|
||||
|
||||
Fixed a free extent processing error that could prevent mount from
|
||||
proceeding when free data extents were sufficiently fragmented. It now
|
||||
properly handles very fragmented free extent maps.
|
||||
|
||||
Fixed a statfs server processing race that could return spurious errors
|
||||
and shut down the server. With the race closed statfs processing is
|
||||
reliable.
|
||||
|
||||
Fixed a rare livelock in the move\_blocks ioctl. With the right
|
||||
relationship between ioctl arguments and eventual file extent items the
|
||||
core loop in the move\_blocks ioctl could get stuck looping on an extent
|
||||
item and never return. The loop exit conditions were fixed and the loop
|
||||
will always advance through all extents.
|
||||
|
||||
Changed the 'print' scoutfs commands to flush the block cache for the
|
||||
devices. It was inconvenient to expect cache flushing to be a separate
|
||||
step to ensure consistency with remote node writes.
|
||||
|
||||
---
|
||||
v1.10
|
||||
\
|
||||
|
||||
@@ -976,6 +976,16 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
break;
|
||||
}
|
||||
|
||||
/* return partial if the server alloc can't dirty any more */
|
||||
if (scoutfs_alloc_meta_low(sb, alloc, 50 + extent_mod_blocks(src->root.height) +
|
||||
extent_mod_blocks(dst->root.height))) {
|
||||
if (WARN_ON_ONCE(!moved))
|
||||
ret = -ENOSPC;
|
||||
else
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
/* searching set start/len, finish initializing alloced extent */
|
||||
ext.map = found.map ? ext.start - found.start + found.map : 0;
|
||||
ext.flags = found.flags;
|
||||
@@ -1572,12 +1582,10 @@ out:
|
||||
* call the caller's callback. This assumes that the super it's reading
|
||||
* could be stale and will retry if it encounters stale blocks.
|
||||
*/
|
||||
int scoutfs_alloc_foreach(struct super_block *sb,
|
||||
scoutfs_alloc_foreach_cb_t cb, void *arg)
|
||||
int scoutfs_alloc_foreach(struct super_block *sb, scoutfs_alloc_foreach_cb_t cb, void *arg)
|
||||
{
|
||||
struct scoutfs_super_block *super = NULL;
|
||||
struct scoutfs_block_ref stale_refs[2] = {{0,}};
|
||||
struct scoutfs_block_ref refs[2] = {{0,}};
|
||||
DECLARE_SAVED_REFS(saved);
|
||||
int ret;
|
||||
|
||||
super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
|
||||
@@ -1586,26 +1594,18 @@ int scoutfs_alloc_foreach(struct super_block *sb,
|
||||
goto out;
|
||||
}
|
||||
|
||||
retry:
|
||||
ret = scoutfs_read_super(sb, super);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
do {
|
||||
ret = scoutfs_read_super(sb, super);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
refs[0] = super->logs_root.ref;
|
||||
refs[1] = super->srch_root.ref;
|
||||
ret = scoutfs_alloc_foreach_super(sb, super, cb, arg);
|
||||
|
||||
ret = scoutfs_block_check_stale(sb, ret, &saved, &super->logs_root.ref,
|
||||
&super->srch_root.ref);
|
||||
} while (ret == -ESTALE);
|
||||
|
||||
ret = scoutfs_alloc_foreach_super(sb, super, cb, arg);
|
||||
out:
|
||||
if (ret == -ESTALE) {
|
||||
if (memcmp(&stale_refs, &refs, sizeof(refs)) == 0) {
|
||||
ret = -EIO;
|
||||
} else {
|
||||
BUILD_BUG_ON(sizeof(stale_refs) != sizeof(refs));
|
||||
memcpy(stale_refs, refs, sizeof(stale_refs));
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
|
||||
kfree(super);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -677,7 +677,7 @@ out:
|
||||
int scoutfs_block_read_ref(struct super_block *sb, struct scoutfs_block_ref *ref, u32 magic,
|
||||
struct scoutfs_block **bl_ret)
|
||||
{
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_block_header *hdr;
|
||||
struct block_private *bp = NULL;
|
||||
bool retried = false;
|
||||
@@ -701,7 +701,7 @@ retry:
|
||||
set_bit(BLOCK_BIT_CRC_VALID, &bp->bits);
|
||||
}
|
||||
|
||||
if (hdr->magic != cpu_to_le32(magic) || hdr->fsid != super->hdr.fsid ||
|
||||
if (hdr->magic != cpu_to_le32(magic) || hdr->fsid != cpu_to_le64(sbi->fsid) ||
|
||||
hdr->seq != ref->seq || hdr->blkno != ref->blkno) {
|
||||
ret = -ESTALE;
|
||||
goto out;
|
||||
@@ -728,6 +728,36 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool stale_refs_match(struct scoutfs_block_ref *caller, struct scoutfs_block_ref *saved)
|
||||
{
|
||||
return !caller || (caller->blkno == saved->blkno && caller->seq == saved->seq);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if a read of a reference that gave ESTALE should be retried or
|
||||
* should generate a hard error. If this is the second time we got
|
||||
* ESTALE from the same refs then we return EIO and the caller should
|
||||
* stop. As long as we keep seeing different refs we'll return ESTALE
|
||||
* and the caller can keep trying.
|
||||
*/
|
||||
int scoutfs_block_check_stale(struct super_block *sb, int ret,
|
||||
struct scoutfs_block_saved_refs *saved,
|
||||
struct scoutfs_block_ref *a, struct scoutfs_block_ref *b)
|
||||
{
|
||||
if (ret == -ESTALE) {
|
||||
if (stale_refs_match(a, &saved->refs[0]) && stale_refs_match(b, &saved->refs[1])){
|
||||
ret = -EIO;
|
||||
} else {
|
||||
if (a)
|
||||
saved->refs[0] = *a;
|
||||
if (b)
|
||||
saved->refs[1] = *b;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void scoutfs_block_put(struct super_block *sb, struct scoutfs_block *bl)
|
||||
{
|
||||
if (!IS_ERR_OR_NULL(bl))
|
||||
@@ -797,7 +827,7 @@ int scoutfs_block_dirty_ref(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
u32 magic, struct scoutfs_block **bl_ret,
|
||||
u64 dirty_blkno, u64 *ref_blkno)
|
||||
{
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_block *cow_bl = NULL;
|
||||
struct scoutfs_block *bl = NULL;
|
||||
struct block_private *exist_bp = NULL;
|
||||
@@ -865,7 +895,7 @@ int scoutfs_block_dirty_ref(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
|
||||
hdr = bl->data;
|
||||
hdr->magic = cpu_to_le32(magic);
|
||||
hdr->fsid = super->hdr.fsid;
|
||||
hdr->fsid = cpu_to_le64(sbi->fsid);
|
||||
hdr->blkno = cpu_to_le64(bl->blkno);
|
||||
prandom_bytes(&hdr->seq, sizeof(hdr->seq));
|
||||
|
||||
@@ -1066,6 +1096,7 @@ static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)
|
||||
struct super_block *sb = binf->sb;
|
||||
struct rhashtable_iter iter;
|
||||
struct block_private *bp;
|
||||
bool stop = false;
|
||||
unsigned long nr;
|
||||
u64 recently;
|
||||
|
||||
@@ -1077,7 +1108,6 @@ static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)
|
||||
|
||||
nr = DIV_ROUND_UP(nr, SCOUTFS_BLOCK_LG_PAGES_PER);
|
||||
|
||||
restart:
|
||||
recently = accessed_recently(binf);
|
||||
rhashtable_walk_enter(&binf->ht, &iter);
|
||||
rhashtable_walk_start(&iter);
|
||||
@@ -1099,12 +1129,15 @@ restart:
|
||||
if (bp == NULL)
|
||||
break;
|
||||
if (bp == ERR_PTR(-EAGAIN)) {
|
||||
/* hard exit to wait for rcu rebalance to finish */
|
||||
rhashtable_walk_stop(&iter);
|
||||
rhashtable_walk_exit(&iter);
|
||||
scoutfs_inc_counter(sb, block_cache_shrink_restart);
|
||||
synchronize_rcu();
|
||||
goto restart;
|
||||
/*
|
||||
* We can be called from reclaim in the allocation
|
||||
* to resize the hash table itself. We have to
|
||||
* return so that the caller can proceed and
|
||||
* enable hash table iteration again.
|
||||
*/
|
||||
scoutfs_inc_counter(sb, block_cache_shrink_stop);
|
||||
stop = true;
|
||||
break;
|
||||
}
|
||||
|
||||
scoutfs_inc_counter(sb, block_cache_shrink_next);
|
||||
@@ -1127,8 +1160,11 @@ restart:
|
||||
rhashtable_walk_stop(&iter);
|
||||
rhashtable_walk_exit(&iter);
|
||||
out:
|
||||
return min_t(u64, (u64)atomic_read(&binf->total_inserted) * SCOUTFS_BLOCK_LG_PAGES_PER,
|
||||
INT_MAX);
|
||||
if (stop)
|
||||
return -1;
|
||||
else
|
||||
return min_t(u64, INT_MAX,
|
||||
(u64)atomic_read(&binf->total_inserted) * SCOUTFS_BLOCK_LG_PAGES_PER);
|
||||
}
|
||||
|
||||
struct sm_block_completion {
|
||||
|
||||
@@ -13,6 +13,17 @@ struct scoutfs_block {
|
||||
void *priv;
|
||||
};
|
||||
|
||||
struct scoutfs_block_saved_refs {
|
||||
struct scoutfs_block_ref refs[2];
|
||||
};
|
||||
|
||||
#define DECLARE_SAVED_REFS(name) \
|
||||
struct scoutfs_block_saved_refs name = {{{0,}}}
|
||||
|
||||
int scoutfs_block_check_stale(struct super_block *sb, int ret,
|
||||
struct scoutfs_block_saved_refs *saved,
|
||||
struct scoutfs_block_ref *a, struct scoutfs_block_ref *b);
|
||||
|
||||
int scoutfs_block_read_ref(struct super_block *sb, struct scoutfs_block_ref *ref, u32 magic,
|
||||
struct scoutfs_block **bl_ret);
|
||||
void scoutfs_block_put(struct super_block *sb, struct scoutfs_block *bl);
|
||||
|
||||
@@ -356,7 +356,6 @@ static int client_greeting(struct super_block *sb,
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct client_info *client = sbi->client_info;
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_net_greeting *gr = resp;
|
||||
bool new_server;
|
||||
int ret;
|
||||
@@ -371,9 +370,9 @@ static int client_greeting(struct super_block *sb,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (gr->fsid != super->hdr.fsid) {
|
||||
if (gr->fsid != cpu_to_le64(sbi->fsid)) {
|
||||
scoutfs_warn(sb, "server greeting response fsid 0x%llx did not match client fsid 0x%llx",
|
||||
le64_to_cpu(gr->fsid), le64_to_cpu(super->hdr.fsid));
|
||||
le64_to_cpu(gr->fsid), sbi->fsid);
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
@@ -476,7 +475,6 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
|
||||
connect_dwork.work);
|
||||
struct super_block *sb = client->sb;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct scoutfs_net_greeting greet;
|
||||
struct sockaddr_in sin;
|
||||
@@ -508,7 +506,7 @@ static void scoutfs_client_connect_worker(struct work_struct *work)
|
||||
goto out;
|
||||
|
||||
/* send a greeting to verify endpoints of each connection */
|
||||
greet.fsid = super->hdr.fsid;
|
||||
greet.fsid = cpu_to_le64(sbi->fsid);
|
||||
greet.fmt_vers = cpu_to_le64(sbi->fmt_vers);
|
||||
greet.server_term = cpu_to_le64(client->server_term);
|
||||
greet.rid = cpu_to_le64(sbi->rid);
|
||||
|
||||
@@ -34,7 +34,7 @@
|
||||
EXPAND_COUNTER(block_cache_shrink_next) \
|
||||
EXPAND_COUNTER(block_cache_shrink_recent) \
|
||||
EXPAND_COUNTER(block_cache_shrink_remove) \
|
||||
EXPAND_COUNTER(block_cache_shrink_restart) \
|
||||
EXPAND_COUNTER(block_cache_shrink_stop) \
|
||||
EXPAND_COUNTER(btree_compact_values) \
|
||||
EXPAND_COUNTER(btree_compact_values_enomem) \
|
||||
EXPAND_COUNTER(btree_delete) \
|
||||
@@ -166,6 +166,7 @@
|
||||
EXPAND_COUNTER(quorum_recv_resignation) \
|
||||
EXPAND_COUNTER(quorum_recv_vote) \
|
||||
EXPAND_COUNTER(quorum_send_heartbeat) \
|
||||
EXPAND_COUNTER(quorum_send_heartbeat_dropped) \
|
||||
EXPAND_COUNTER(quorum_send_resignation) \
|
||||
EXPAND_COUNTER(quorum_send_request) \
|
||||
EXPAND_COUNTER(quorum_send_vote) \
|
||||
@@ -187,8 +188,6 @@
|
||||
EXPAND_COUNTER(srch_search_retry_empty) \
|
||||
EXPAND_COUNTER(srch_search_sorted) \
|
||||
EXPAND_COUNTER(srch_search_sorted_block) \
|
||||
EXPAND_COUNTER(srch_search_stale_eio) \
|
||||
EXPAND_COUNTER(srch_search_stale_retry) \
|
||||
EXPAND_COUNTER(srch_search_xattrs) \
|
||||
EXPAND_COUNTER(srch_read_stale) \
|
||||
EXPAND_COUNTER(statfs) \
|
||||
|
||||
@@ -456,11 +456,11 @@ static int alloc_block(struct super_block *sb, struct inode *inode,
|
||||
|
||||
} else {
|
||||
/*
|
||||
* Preallocation of aligned regions only preallocates if
|
||||
* the aligned region contains no extents at all. This
|
||||
* could be fooled by offline sparse extents but we
|
||||
* don't want to iterate over all offline extents in the
|
||||
* aligned region.
|
||||
* Preallocation within aligned regions tries to
|
||||
* allocate an extent to fill the hole in the region
|
||||
* that contains iblock. We'd have to add a bit of plumbing
|
||||
* to find previous extents so we only search for a next
|
||||
* extent from the front of the region and from iblock.
|
||||
*/
|
||||
div64_u64_rem(iblock, opts.data_prealloc_blocks, &rem);
|
||||
start = iblock - rem;
|
||||
@@ -468,8 +468,20 @@ static int alloc_block(struct super_block *sb, struct inode *inode,
|
||||
ret = scoutfs_ext_next(sb, &data_ext_ops, &args, start, 1, &found);
|
||||
if (ret < 0 && ret != -ENOENT)
|
||||
goto out;
|
||||
if (found.len && found.start < start + count)
|
||||
count = 1;
|
||||
|
||||
/* trim count if there's an extent in the region before iblock */
|
||||
if (found.len && found.start < iblock) {
|
||||
count -= iblock - start;
|
||||
start = iblock;
|
||||
/* see if there's also an extent after iblock */
|
||||
ret = scoutfs_ext_next(sb, &data_ext_ops, &args, iblock, 1, &found);
|
||||
if (ret < 0 && ret != -ENOENT)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* trim count by next extent after iblock */
|
||||
if (found.len && found.start > start && found.start < start + count)
|
||||
count = (found.start - start);
|
||||
}
|
||||
|
||||
/* overall prealloc limit */
|
||||
@@ -1192,9 +1204,9 @@ static void truncate_inode_pages_extent(struct inode *inode, u64 start, u64 len)
|
||||
* explained above the move_blocks ioctl argument structure definition.
|
||||
*
|
||||
* The caller has processed the ioctl args and performed the most basic
|
||||
* inode checks, but we perform more detailed inode checks once we have
|
||||
* the inode lock and refreshed inodes. Our job is to safely lock the
|
||||
* two files and move the extents.
|
||||
* argument sanity and inode checks, but we perform more detailed inode
|
||||
* checks once we have the inode lock and refreshed inodes. Our job is
|
||||
* to safely lock the two files and move the extents.
|
||||
*/
|
||||
#define MOVE_DATA_EXTENTS_PER_HOLD 16
|
||||
int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
@@ -1253,6 +1265,16 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
from_iblock = from_off >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
count = (byte_len + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
to_iblock = to_off >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
from_start = from_iblock;
|
||||
|
||||
/* only move extent blocks inside i_size, careful not to wrap */
|
||||
from_size = i_size_read(from);
|
||||
if (from_off >= from_size) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
if (from_off + byte_len > from_size)
|
||||
count = ((from_size - from_off) + SCOUTFS_BLOCK_SM_MASK) >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
|
||||
if (S_ISDIR(from->i_mode) || S_ISDIR(to->i_mode)) {
|
||||
ret = -EISDIR;
|
||||
@@ -1320,7 +1342,7 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
|
||||
/* find the next extent to move */
|
||||
ret = scoutfs_ext_next(sb, &data_ext_ops, &from_args,
|
||||
from_iblock, 1, &ext);
|
||||
from_start, 1, &ext);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT) {
|
||||
done = true;
|
||||
@@ -1329,9 +1351,8 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
break;
|
||||
}
|
||||
|
||||
/* only move extents within count and i_size */
|
||||
if (ext.start >= from_iblock + count ||
|
||||
ext.start >= i_size_read(from)) {
|
||||
/* done if next extent starts after moving region */
|
||||
if (ext.start >= from_iblock + count) {
|
||||
done = true;
|
||||
ret = 0;
|
||||
break;
|
||||
@@ -1339,13 +1360,15 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
|
||||
from_start = max(ext.start, from_iblock);
|
||||
map = ext.map + (from_start - ext.start);
|
||||
len = min3(from_iblock + count,
|
||||
round_up((u64)i_size_read(from),
|
||||
SCOUTFS_BLOCK_SM_SIZE),
|
||||
ext.start + ext.len) - from_start;
|
||||
|
||||
len = min(from_iblock + count, ext.start + ext.len) - from_start;
|
||||
to_start = to_iblock + (from_start - from_iblock);
|
||||
|
||||
/* we'd get stuck, shouldn't happen */
|
||||
if (WARN_ON_ONCE(len == 0)) {
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (is_stage) {
|
||||
ret = scoutfs_ext_next(sb, &data_ext_ops, &to_args,
|
||||
to_start, 1, &off_ext);
|
||||
@@ -1407,6 +1430,12 @@ int scoutfs_data_move_blocks(struct inode *from, u64 from_off,
|
||||
i_size_read(from);
|
||||
i_size_write(to, to_size);
|
||||
}
|
||||
|
||||
/* find next after moved extent, avoiding wrapping */
|
||||
if (from_start + len < from_start)
|
||||
from_start = from_iblock + count + 1;
|
||||
else
|
||||
from_start += len;
|
||||
}
|
||||
|
||||
|
||||
|
||||
119
kmod/src/dir.c
119
kmod/src/dir.c
@@ -1253,75 +1253,93 @@ int scoutfs_symlink_drop(struct super_block *sb, u64 ino,
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the next link backref key for the given ino starting from the
|
||||
* given dir inode and final entry position. If we find a backref item
|
||||
* we add an allocated copy of it to the head of the caller's list.
|
||||
* Find the next link backref items for the given ino starting from the
|
||||
* given dir inode and final entry position. For each backref item we
|
||||
* add an allocated copy of it to the head of the caller's list.
|
||||
*
|
||||
* Returns 0 if we added an entry, -ENOENT if we didn't, and -errno for
|
||||
* search errors.
|
||||
* Callers who are building a path can add one entry for each parent.
|
||||
* They're left with a list of entries from the root down in list order.
|
||||
*
|
||||
* Callers who are gathering multiple entries for one inode get the
|
||||
* entries in the opposite order that their items are found.
|
||||
*
|
||||
* Returns +ve for number of entries added, -ENOENT if no entries were
|
||||
* found, or -errno on error. It weirdly won't return 0, but early
|
||||
* callers preferred -ENOENT so we use that for the case of no entries.
|
||||
*
|
||||
* Callers are comfortable with the race inherent to incrementally
|
||||
* building up a path with individual locked backref item lookups.
|
||||
* gathering backrefs across multiple lock acquisitions.
|
||||
*/
|
||||
int scoutfs_dir_add_next_linkref(struct super_block *sb, u64 ino,
|
||||
u64 dir_ino, u64 dir_pos,
|
||||
struct list_head *list)
|
||||
int scoutfs_dir_add_next_linkrefs(struct super_block *sb, u64 ino, u64 dir_ino, u64 dir_pos,
|
||||
int count, struct list_head *list)
|
||||
{
|
||||
struct scoutfs_link_backref_entry *prev_ent = NULL;
|
||||
struct scoutfs_link_backref_entry *ent = NULL;
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct scoutfs_key last_key;
|
||||
struct scoutfs_key key;
|
||||
int nr = 0;
|
||||
int len;
|
||||
int ret;
|
||||
|
||||
ent = kmalloc(offsetof(struct scoutfs_link_backref_entry,
|
||||
dent.name[SCOUTFS_NAME_LEN]), GFP_KERNEL);
|
||||
if (!ent) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
INIT_LIST_HEAD(&ent->head);
|
||||
|
||||
init_dirent_key(&key, SCOUTFS_LINK_BACKREF_TYPE, ino, dir_ino, dir_pos);
|
||||
init_dirent_key(&last_key, SCOUTFS_LINK_BACKREF_TYPE, ino, U64_MAX,
|
||||
U64_MAX);
|
||||
init_dirent_key(&last_key, SCOUTFS_LINK_BACKREF_TYPE, ino, U64_MAX, U64_MAX);
|
||||
|
||||
ret = scoutfs_lock_ino(sb, SCOUTFS_LOCK_READ, 0, ino, &lock);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
ret = scoutfs_item_next(sb, &key, &last_key, &ent->dent,
|
||||
dirent_bytes(SCOUTFS_NAME_LEN), lock);
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
lock = NULL;
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
while (nr < count) {
|
||||
ent = kmalloc(offsetof(struct scoutfs_link_backref_entry,
|
||||
dent.name[SCOUTFS_NAME_LEN]), GFP_NOFS);
|
||||
if (!ent) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
len = ret - sizeof(struct scoutfs_dirent);
|
||||
if (len < 1 || len > SCOUTFS_NAME_LEN) {
|
||||
scoutfs_corruption(sb, SC_DIRENT_BACKREF_NAME_LEN,
|
||||
corrupt_dirent_backref_name_len,
|
||||
"ino %llu dir_ino %llu pos %llu key "SK_FMT" len %d",
|
||||
ino, dir_ino, dir_pos, SK_ARG(&key), len);
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
INIT_LIST_HEAD(&ent->head);
|
||||
|
||||
ret = scoutfs_item_next(sb, &key, &last_key, &ent->dent,
|
||||
dirent_bytes(SCOUTFS_NAME_LEN), lock);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT && prev_ent)
|
||||
prev_ent->last = true;
|
||||
goto out;
|
||||
}
|
||||
|
||||
len = ret - sizeof(struct scoutfs_dirent);
|
||||
if (len < 1 || len > SCOUTFS_NAME_LEN) {
|
||||
scoutfs_corruption(sb, SC_DIRENT_BACKREF_NAME_LEN,
|
||||
corrupt_dirent_backref_name_len,
|
||||
"ino %llu dir_ino %llu pos %llu key "SK_FMT" len %d",
|
||||
ino, dir_ino, dir_pos, SK_ARG(&key), len);
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ent->dir_ino = le64_to_cpu(key.skd_major);
|
||||
ent->dir_pos = le64_to_cpu(key.skd_minor);
|
||||
ent->name_len = len;
|
||||
ent->d_type = dentry_type(ent->dent.type);
|
||||
ent->last = false;
|
||||
|
||||
trace_scoutfs_dir_add_next_linkref_found(sb, ino, ent->dir_ino, ent->dir_pos,
|
||||
ent->name_len);
|
||||
|
||||
list_add(&ent->head, list);
|
||||
prev_ent = ent;
|
||||
ent = NULL;
|
||||
nr++;
|
||||
scoutfs_key_inc(&key);
|
||||
}
|
||||
|
||||
list_add(&ent->head, list);
|
||||
ent->dir_ino = le64_to_cpu(key.skd_major);
|
||||
ent->dir_pos = le64_to_cpu(key.skd_minor);
|
||||
ent->name_len = len;
|
||||
ret = 0;
|
||||
out:
|
||||
trace_scoutfs_dir_add_next_linkref(sb, ino, dir_ino, dir_pos, ret,
|
||||
ent ? ent->dir_ino : 0,
|
||||
ent ? ent->dir_pos : 0,
|
||||
ent ? ent->name_len : 0);
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_READ);
|
||||
trace_scoutfs_dir_add_next_linkrefs(sb, ino, dir_ino, dir_pos, count, nr, ret);
|
||||
|
||||
if (ent && list_empty(&ent->head))
|
||||
kfree(ent);
|
||||
return ret;
|
||||
kfree(ent);
|
||||
return nr ?: ret;
|
||||
}
|
||||
|
||||
static u64 first_backref_dir_ino(struct list_head *list)
|
||||
@@ -1396,7 +1414,7 @@ retry:
|
||||
}
|
||||
|
||||
/* get the next link name to the given inode */
|
||||
ret = scoutfs_dir_add_next_linkref(sb, ino, dir_ino, dir_pos, list);
|
||||
ret = scoutfs_dir_add_next_linkrefs(sb, ino, dir_ino, dir_pos, 1, list);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
@@ -1404,7 +1422,7 @@ retry:
|
||||
par_ino = first_backref_dir_ino(list);
|
||||
while (par_ino != SCOUTFS_ROOT_INO) {
|
||||
|
||||
ret = scoutfs_dir_add_next_linkref(sb, par_ino, 0, 0, list);
|
||||
ret = scoutfs_dir_add_next_linkrefs(sb, par_ino, 0, 0, 1, list);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT) {
|
||||
/* restart if there was no parent component */
|
||||
@@ -1416,6 +1434,8 @@ retry:
|
||||
|
||||
par_ino = first_backref_dir_ino(list);
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret < 0)
|
||||
scoutfs_dir_free_backref_path(sb, list);
|
||||
@@ -1443,6 +1463,11 @@ static int item_d_ancestor(struct super_block *sb, u64 p1, u64 p2, u64 *p_ret)
|
||||
|
||||
*p_ret = 0;
|
||||
|
||||
if (p2 == SCOUTFS_ROOT_INO) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_dir_get_backref_path(sb, p2, 0, 0, &list);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
@@ -15,6 +15,8 @@ struct scoutfs_link_backref_entry {
|
||||
u64 dir_ino;
|
||||
u64 dir_pos;
|
||||
u16 name_len;
|
||||
u8 d_type;
|
||||
bool last;
|
||||
struct scoutfs_dirent dent;
|
||||
/* the full name is allocated and stored in dent.name[] */
|
||||
};
|
||||
@@ -24,9 +26,8 @@ int scoutfs_dir_get_backref_path(struct super_block *sb, u64 ino, u64 dir_ino,
|
||||
void scoutfs_dir_free_backref_path(struct super_block *sb,
|
||||
struct list_head *list);
|
||||
|
||||
int scoutfs_dir_add_next_linkref(struct super_block *sb, u64 ino,
|
||||
u64 dir_ino, u64 dir_pos,
|
||||
struct list_head *list);
|
||||
int scoutfs_dir_add_next_linkrefs(struct super_block *sb, u64 ino, u64 dir_ino, u64 dir_pos,
|
||||
int count, struct list_head *list);
|
||||
|
||||
int scoutfs_symlink_drop(struct super_block *sb, u64 ino,
|
||||
struct scoutfs_lock *lock, u64 i_size);
|
||||
|
||||
@@ -114,8 +114,8 @@ static struct dentry *scoutfs_get_parent(struct dentry *child)
|
||||
int ret;
|
||||
u64 ino;
|
||||
|
||||
ret = scoutfs_dir_add_next_linkref(sb, scoutfs_ino(inode), 0, 0, &list);
|
||||
if (ret)
|
||||
ret = scoutfs_dir_add_next_linkrefs(sb, scoutfs_ino(inode), 0, 0, 1, &list);
|
||||
if (ret < 0)
|
||||
return ERR_PTR(ret);
|
||||
|
||||
ent = list_first_entry(&list, struct scoutfs_link_backref_entry, head);
|
||||
@@ -138,9 +138,9 @@ static int scoutfs_get_name(struct dentry *parent, char *name,
|
||||
LIST_HEAD(list);
|
||||
int ret;
|
||||
|
||||
ret = scoutfs_dir_add_next_linkref(sb, scoutfs_ino(inode), dir_ino,
|
||||
0, &list);
|
||||
if (ret)
|
||||
ret = scoutfs_dir_add_next_linkrefs(sb, scoutfs_ino(inode), dir_ino,
|
||||
0, 1, &list);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
ret = -ENOENT;
|
||||
|
||||
@@ -78,11 +78,6 @@ struct forest_refs {
|
||||
struct scoutfs_block_ref logs_ref;
|
||||
};
|
||||
|
||||
/* initialize some refs that initially aren't equal */
|
||||
#define DECLARE_STALE_TRACKING_SUPER_REFS(a, b) \
|
||||
struct forest_refs a = {{cpu_to_le64(0),}}; \
|
||||
struct forest_refs b = {{cpu_to_le64(1),}}
|
||||
|
||||
struct forest_bloom_nrs {
|
||||
unsigned int nrs[SCOUTFS_FOREST_BLOOM_NRS];
|
||||
};
|
||||
@@ -136,11 +131,11 @@ static struct scoutfs_block *read_bloom_ref(struct super_block *sb, struct scout
|
||||
int scoutfs_forest_next_hint(struct super_block *sb, struct scoutfs_key *key,
|
||||
struct scoutfs_key *next)
|
||||
{
|
||||
DECLARE_STALE_TRACKING_SUPER_REFS(prev_refs, refs);
|
||||
struct scoutfs_net_roots roots;
|
||||
struct scoutfs_btree_root item_root;
|
||||
struct scoutfs_log_trees *lt;
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
DECLARE_SAVED_REFS(saved);
|
||||
struct scoutfs_key found;
|
||||
struct scoutfs_key ltk;
|
||||
bool checked_fs;
|
||||
@@ -155,8 +150,6 @@ retry:
|
||||
goto out;
|
||||
|
||||
trace_scoutfs_forest_using_roots(sb, &roots.fs_root, &roots.logs_root);
|
||||
refs.fs_ref = roots.fs_root.ref;
|
||||
refs.logs_ref = roots.logs_root.ref;
|
||||
|
||||
scoutfs_key_init_log_trees(&ltk, 0, 0);
|
||||
checked_fs = false;
|
||||
@@ -212,14 +205,10 @@ retry:
|
||||
}
|
||||
}
|
||||
|
||||
if (ret == -ESTALE) {
|
||||
if (memcmp(&prev_refs, &refs, sizeof(refs)) == 0)
|
||||
return -EIO;
|
||||
prev_refs = refs;
|
||||
ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.fs_root.ref, &roots.logs_root.ref);
|
||||
if (ret == -ESTALE)
|
||||
goto retry;
|
||||
}
|
||||
out:
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -541,9 +530,8 @@ void scoutfs_forest_dec_inode_count(struct super_block *sb)
|
||||
|
||||
/*
|
||||
* Return the total inode count from the super block and all the
|
||||
* log_btrees it references. This assumes it's working with a block
|
||||
* reference hierarchy that should be fully consistent. If we see
|
||||
* ESTALE we've hit persistent corruption.
|
||||
* log_btrees it references. ESTALE from read blocks is returned to the
|
||||
* caller who is expected to retry or return hard errors.
|
||||
*/
|
||||
int scoutfs_forest_inode_count(struct super_block *sb, struct scoutfs_super_block *super,
|
||||
u64 *inode_count)
|
||||
@@ -572,8 +560,6 @@ int scoutfs_forest_inode_count(struct super_block *sb, struct scoutfs_super_bloc
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
else if (ret == -ESTALE)
|
||||
ret = -EIO;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -683,16 +683,19 @@ struct scoutfs_xattr_totl_val {
|
||||
#define SCOUTFS_QUORUM_ELECT_VAR_MS 100
|
||||
|
||||
/*
|
||||
* Once a leader is elected they send out heartbeats at regular
|
||||
* intervals to force members to wait the much longer heartbeat timeout.
|
||||
* Once heartbeat timeout expires without receiving a heartbeat they'll
|
||||
* switch over the performing elections.
|
||||
* Once a leader is elected they send heartbeat messages to all quorum
|
||||
* members at regular intervals to force members to wait the much longer
|
||||
* heartbeat timeout. Once the heartbeat timeout expires without
|
||||
* receiving a heartbeat message a member will start an election.
|
||||
*
|
||||
* These determine how long it could take members to notice that a
|
||||
* leader has gone silent and start to elect a new leader.
|
||||
* leader has gone silent and start to elect a new leader. The
|
||||
* heartbeat timeout can be changed at run time by options.
|
||||
*/
|
||||
#define SCOUTFS_QUORUM_HB_IVAL_MS 100
|
||||
#define SCOUTFS_QUORUM_HB_TIMEO_MS (5 * MSEC_PER_SEC)
|
||||
#define SCOUTFS_QUORUM_MIN_HB_TIMEO_MS (2 * MSEC_PER_SEC)
|
||||
#define SCOUTFS_QUORUM_DEF_HB_TIMEO_MS (10 * MSEC_PER_SEC)
|
||||
#define SCOUTFS_QUORUM_MAX_HB_TIMEO_MS (60 * MSEC_PER_SEC)
|
||||
|
||||
/*
|
||||
* A newly elected leader will give fencing some time before giving up and
|
||||
|
||||
106
kmod/src/ioctl.c
106
kmod/src/ioctl.c
@@ -1398,6 +1398,110 @@ out:
|
||||
return ret ?: nr;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy entries that point to an inode to the user's buffer. We copy to
|
||||
* userspace from copies of the entries that are acquired under a lock
|
||||
* so that we don't fault while holding cluster locks. It also gives us
|
||||
* a chance to limit the amount of work under each lock hold.
|
||||
*/
|
||||
static long scoutfs_ioc_get_referring_entries(struct file *file, unsigned long arg)
|
||||
{
|
||||
struct super_block *sb = file_inode(file)->i_sb;
|
||||
struct scoutfs_ioctl_get_referring_entries gre;
|
||||
struct scoutfs_link_backref_entry *bref = NULL;
|
||||
struct scoutfs_link_backref_entry *bref_tmp;
|
||||
struct scoutfs_ioctl_dirent __user *uent;
|
||||
struct scoutfs_ioctl_dirent ent;
|
||||
LIST_HEAD(list);
|
||||
u64 copied;
|
||||
int name_len;
|
||||
int bytes;
|
||||
long nr;
|
||||
int ret;
|
||||
|
||||
if (!capable(CAP_DAC_READ_SEARCH))
|
||||
return -EPERM;
|
||||
|
||||
if (copy_from_user(&gre, (void __user *)arg, sizeof(gre)))
|
||||
return -EFAULT;
|
||||
|
||||
uent = (void __user *)(unsigned long)gre.entries_ptr;
|
||||
copied = 0;
|
||||
nr = 0;
|
||||
|
||||
/* use entry as cursor between calls */
|
||||
ent.dir_ino = gre.dir_ino;
|
||||
ent.dir_pos = gre.dir_pos;
|
||||
|
||||
for (;;) {
|
||||
ret = scoutfs_dir_add_next_linkrefs(sb, gre.ino, ent.dir_ino, ent.dir_pos, 1024,
|
||||
&list);
|
||||
if (ret < 0) {
|
||||
if (ret == -ENOENT)
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* _add_next adds each entry to the head, _reverse for key order */
|
||||
list_for_each_entry_safe_reverse(bref, bref_tmp, &list, head) {
|
||||
list_del_init(&bref->head);
|
||||
|
||||
name_len = bref->name_len;
|
||||
bytes = ALIGN(offsetof(struct scoutfs_ioctl_dirent, name[name_len + 1]),
|
||||
16);
|
||||
if (copied + bytes > gre.entries_bytes) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ent.dir_ino = bref->dir_ino;
|
||||
ent.dir_pos = bref->dir_pos;
|
||||
ent.ino = gre.ino;
|
||||
ent.entry_bytes = bytes;
|
||||
ent.flags = bref->last ? SCOUTFS_IOCTL_DIRENT_FLAG_LAST : 0;
|
||||
ent.d_type = bref->d_type;
|
||||
ent.name_len = name_len;
|
||||
|
||||
if (copy_to_user(uent, &ent, sizeof(struct scoutfs_ioctl_dirent)) ||
|
||||
copy_to_user(&uent->name[0], bref->dent.name, name_len) ||
|
||||
put_user('\0', &uent->name[name_len])) {
|
||||
ret = -EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
kfree(bref);
|
||||
bref = NULL;
|
||||
|
||||
uent = (void __user *)uent + bytes;
|
||||
copied += bytes;
|
||||
nr++;
|
||||
|
||||
if (nr == LONG_MAX || (ent.flags & SCOUTFS_IOCTL_DIRENT_FLAG_LAST)) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* advance cursor pos from last copied entry */
|
||||
if (++ent.dir_pos == 0) {
|
||||
if (++ent.dir_ino == 0) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
kfree(bref);
|
||||
list_for_each_entry_safe(bref, bref_tmp, &list, head) {
|
||||
list_del_init(&bref->head);
|
||||
kfree(bref);
|
||||
}
|
||||
|
||||
return nr ?: ret;
|
||||
}
|
||||
|
||||
long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
{
|
||||
switch (cmd) {
|
||||
@@ -1433,6 +1537,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
|
||||
return scoutfs_ioc_read_xattr_totals(file, arg);
|
||||
case SCOUTFS_IOC_GET_ALLOCATED_INOS:
|
||||
return scoutfs_ioc_get_allocated_inos(file, arg);
|
||||
case SCOUTFS_IOC_GET_REFERRING_ENTRIES:
|
||||
return scoutfs_ioc_get_referring_entries(file, arg);
|
||||
}
|
||||
|
||||
return -ENOTTY;
|
||||
|
||||
114
kmod/src/ioctl.h
114
kmod/src/ioctl.h
@@ -559,4 +559,118 @@ struct scoutfs_ioctl_get_allocated_inos {
|
||||
#define SCOUTFS_IOC_GET_ALLOCATED_INOS \
|
||||
_IOW(SCOUTFS_IOCTL_MAGIC, 16, struct scoutfs_ioctl_get_allocated_inos)
|
||||
|
||||
/*
|
||||
* Get directory entries that refer to a specific inode.
|
||||
*
|
||||
* @ino: The target ino that we're finding referring entries to.
|
||||
* Constant across all the calls that make up an iteration over all the
|
||||
* inode's entries.
|
||||
*
|
||||
* @dir_ino: The inode number of a directory containing the entry to our
|
||||
* inode to search from. If this parent directory contains no more
|
||||
* entries to our inode then we'll search through other parent directory
|
||||
* inodes in inode order.
|
||||
*
|
||||
* @dir_pos: The position in the dir_ino parent directory of the entry
|
||||
* to our inode to search from. If there is no entry at this position
|
||||
* then we'll search through other entry positions in increasing order.
|
||||
* If we exhaust the parent directory then we'll search through
|
||||
* additional parent directories in inode order.
|
||||
*
|
||||
* @entries_ptr: A pointer to the buffer where found entries will be
|
||||
* stored. The pointer must be aligned to 16 bytes.
|
||||
*
|
||||
* @entries_bytes: The size of the buffer that will contain entries.
|
||||
*
|
||||
* To start iterating set the desired target ino, dir_ino to 0, dir_pos
|
||||
* to 0, and set entries_ptr and entries_bytes to a sufficiently large buffer.
|
||||
* Each entry struct that's stored in the buffer adds some overhead so a
|
||||
* large multiple of the largest possible name is a reasonable choice.
|
||||
* (A few multiples of PATH_MAX perhaps.)
|
||||
*
|
||||
* Each call returns the total number of entries that were stored in the
|
||||
* entries buffer. Zero is returned when the search was successful and
|
||||
* no referring entries were found. The entries can be iterated over by
|
||||
* advancing each starting struct offset by the total number of bytes in
|
||||
* each entry. If the _LAST flag is set on an entry then there were no
|
||||
* more entries referring to the inode at the time of the call and
|
||||
* iteration can be stopped.
|
||||
*
|
||||
* To resume iteration set the next call's starting dir_ino and dir_pos
|
||||
* to one past the last entry seen. Increment the last entry's dir_pos,
|
||||
* and if it wrapped to 0, increment its dir_ino.
|
||||
*
|
||||
* This does not check that the caller has permission to read the
|
||||
* entries found in each containing directory. It requires
|
||||
* CAP_DAC_READ_SEARCH which bypasses path traversal permissions
|
||||
* checking.
|
||||
*
|
||||
* Entries returned by a single call can reflect any combination of
|
||||
* racing creation and removal of entries. Each entry existed at the
|
||||
* time it was read though it may have changed in the time it took to
|
||||
* return from the call. The set of entries returned may no longer
|
||||
* reflect the current set of entries and may not have existed at the
|
||||
* same time.
|
||||
*
|
||||
* This has no knowledge of the life cycle of the inode. It can return
|
||||
* 0 when there are no referring entries because either the target inode
|
||||
* doesn't exist, it is in the process of being deleted, or because it
|
||||
* is still open while being unlinked.
|
||||
*
|
||||
* On success this returns the number of entries filled in the buffer.
|
||||
* A return of 0 indicates that no entries referred to the inode.
|
||||
*
|
||||
* EINVAL is returned when there is a problem with the buffer. Either
|
||||
* it was not aligned or it was not large enough for the first entry.
|
||||
*
|
||||
* Many other errnos indicate hard failure to find the next entry.
|
||||
*/
|
||||
struct scoutfs_ioctl_get_referring_entries {
|
||||
__u64 ino;
|
||||
__u64 dir_ino;
|
||||
__u64 dir_pos;
|
||||
__u64 entries_ptr;
|
||||
__u64 entries_bytes;
|
||||
};
|
||||
|
||||
/*
|
||||
* @dir_ino: The inode of the directory containing the entry.
|
||||
*
|
||||
* @dir_pos: The readdir f_pos position of the entry within the
|
||||
* directory.
|
||||
*
|
||||
* @ino: The inode number of the target of the entry.
|
||||
*
|
||||
* @flags: Flags associated with this entry.
|
||||
*
|
||||
* @d_type: Inode type as specified with DT_ enum values in readdir(3).
|
||||
*
|
||||
* @entry_bytes: The total bytes taken by the entry in memory, including
|
||||
* the name and any alignment padding. The start of a following entry
|
||||
* will be found after this number of bytes.
|
||||
*
|
||||
* @name_len: The number of bytes in the name not including the trailing
|
||||
* null, ala strlen(3).
|
||||
*
|
||||
* @name: The null terminated name of the referring entry. In the
|
||||
* struct definition this array is sized to naturally align the struct.
|
||||
* That number of padded bytes are not necessarily found in the buffer
|
||||
* returned by _get_referring_entries.
|
||||
*/
|
||||
struct scoutfs_ioctl_dirent {
|
||||
__u64 dir_ino;
|
||||
__u64 dir_pos;
|
||||
__u64 ino;
|
||||
__u16 entry_bytes;
|
||||
__u8 flags;
|
||||
__u8 d_type;
|
||||
__u8 name_len;
|
||||
__u8 name[3];
|
||||
};
|
||||
|
||||
#define SCOUTFS_IOCTL_DIRENT_FLAG_LAST (1 << 0)
|
||||
|
||||
#define SCOUTFS_IOC_GET_REFERRING_ENTRIES \
|
||||
_IOW(SCOUTFS_IOCTL_MAGIC, 17, struct scoutfs_ioctl_get_referring_entries)
|
||||
|
||||
#endif
|
||||
|
||||
@@ -36,6 +36,7 @@ enum {
|
||||
Opt_metadev_path,
|
||||
Opt_noacl,
|
||||
Opt_orphan_scan_delay_ms,
|
||||
Opt_quorum_heartbeat_timeout_ms,
|
||||
Opt_quorum_slot_nr,
|
||||
Opt_err,
|
||||
};
|
||||
@@ -47,6 +48,7 @@ static const match_table_t tokens = {
|
||||
{Opt_metadev_path, "metadev_path=%s"},
|
||||
{Opt_noacl, "noacl"},
|
||||
{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
|
||||
{Opt_quorum_heartbeat_timeout_ms, "quorum_heartbeat_timeout_ms=%s"},
|
||||
{Opt_quorum_slot_nr, "quorum_slot_nr=%s"},
|
||||
{Opt_err, NULL}
|
||||
};
|
||||
@@ -124,8 +126,24 @@ static void init_default_options(struct scoutfs_mount_options *opts)
|
||||
|
||||
opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
|
||||
opts->data_prealloc_contig_only = 1;
|
||||
opts->quorum_slot_nr = -1;
|
||||
opts->orphan_scan_delay_ms = -1;
|
||||
opts->quorum_heartbeat_timeout_ms = SCOUTFS_QUORUM_DEF_HB_TIMEO_MS;
|
||||
opts->quorum_slot_nr = -1;
|
||||
}
|
||||
|
||||
/*
 * Check a quorum_heartbeat_timeout_ms value after the caller has tried
 * to parse it.  @ret is the result of the caller's parse and @val is
 * the parsed value.  An error is logged and -EINVAL is returned if the
 * parse failed or the value is outside the min/max timeout bounds,
 * otherwise 0 is returned.
 */
static int verify_quorum_heartbeat_timeout_ms(struct super_block *sb, int ret, u64 val)
{
	bool in_range;

	if (ret < 0) {
		scoutfs_err(sb, "failed to parse quorum_heartbeat_timeout_ms value");
		return -EINVAL;
	}

	in_range = val >= SCOUTFS_QUORUM_MIN_HB_TIMEO_MS &&
		   val <= SCOUTFS_QUORUM_MAX_HB_TIMEO_MS;
	if (in_range)
		return 0;

	scoutfs_err(sb, "invalid quorum_heartbeat_timeout_ms value %llu, must be between %lu and %lu",
		    val, SCOUTFS_QUORUM_MIN_HB_TIMEO_MS, SCOUTFS_QUORUM_MAX_HB_TIMEO_MS);
	return -EINVAL;
}
|
||||
|
||||
/*
|
||||
@@ -206,6 +224,14 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
|
||||
opts->orphan_scan_delay_ms = nr;
|
||||
break;
|
||||
|
||||
case Opt_quorum_heartbeat_timeout_ms:
|
||||
ret = match_u64(args, &nr64);
|
||||
ret = verify_quorum_heartbeat_timeout_ms(sb, ret, nr64);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
opts->quorum_heartbeat_timeout_ms = nr64;
|
||||
break;
|
||||
|
||||
case Opt_quorum_slot_nr:
|
||||
if (opts->quorum_slot_nr != -1) {
|
||||
scoutfs_err(sb, "multiple quorum_slot_nr options provided, only provide one.");
|
||||
@@ -448,6 +474,43 @@ static ssize_t orphan_scan_delay_ms_store(struct kobject *kobj, struct kobj_attr
|
||||
}
|
||||
SCOUTFS_ATTR_RW(orphan_scan_delay_ms);
|
||||
|
||||
static ssize_t quorum_heartbeat_timeout_ms_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
struct scoutfs_mount_options opts;
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%llu", opts.quorum_heartbeat_timeout_ms);
|
||||
}
|
||||
/*
 * sysfs store: parse a new quorum_heartbeat_timeout_ms value from the
 * written text, check it with verify_quorum_heartbeat_timeout_ms(),
 * and publish it in the mount options under the options seqlock.
 *
 * Returns @count when the value was accepted, otherwise the error from
 * the verification (-EINVAL).
 */
static ssize_t quorum_heartbeat_timeout_ms_store(struct kobject *kobj, struct kobj_attribute *attr,
						 const char *buf, size_t count)
{
	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
	DECLARE_OPTIONS_INFO(sb, optinf);
	char nullterm[30]; /* more than enough for octal -U64_MAX */
	u64 val = 0; /* passed by value to verify even when parsing fails */
	int len;
	int ret;

	/* the parser needs a terminated string, the written buffer may not be */
	len = min(count, sizeof(nullterm) - 1);
	memcpy(nullterm, buf, len);
	nullterm[len] = '\0';

	/* val is a u64 so use the unsigned parser that matches its type */
	ret = kstrtoull(nullterm, 0, &val);
	ret = verify_quorum_heartbeat_timeout_ms(sb, ret, val);
	if (ret == 0) {
		write_seqlock(&optinf->seqlock);
		optinf->opts.quorum_heartbeat_timeout_ms = val;
		write_sequnlock(&optinf->seqlock);
		ret = count;
	}

	return ret;
}
|
||||
SCOUTFS_ATTR_RW(quorum_heartbeat_timeout_ms);
|
||||
|
||||
static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
|
||||
@@ -464,6 +527,7 @@ static struct attribute *options_attrs[] = {
|
||||
SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
|
||||
SCOUTFS_ATTR_PTR(metadev_path),
|
||||
SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
|
||||
SCOUTFS_ATTR_PTR(quorum_heartbeat_timeout_ms),
|
||||
SCOUTFS_ATTR_PTR(quorum_slot_nr),
|
||||
NULL,
|
||||
};
|
||||
|
||||
@@ -11,7 +11,7 @@ struct scoutfs_mount_options {
|
||||
char *metadev_path;
|
||||
unsigned int orphan_scan_delay_ms;
|
||||
int quorum_slot_nr;
|
||||
|
||||
u64 quorum_heartbeat_timeout_ms;
|
||||
};
|
||||
|
||||
void scoutfs_options_read(struct super_block *sb, struct scoutfs_mount_options *opts);
|
||||
|
||||
@@ -100,6 +100,11 @@ struct last_msg {
|
||||
ktime_t ts;
|
||||
};
|
||||
|
||||
struct count_recent {
|
||||
u64 count;
|
||||
ktime_t recent;
|
||||
};
|
||||
|
||||
enum quorum_role { FOLLOWER, CANDIDATE, LEADER };
|
||||
|
||||
struct quorum_status {
|
||||
@@ -112,8 +117,12 @@ struct quorum_status {
|
||||
ktime_t timeout;
|
||||
};
|
||||
|
||||
#define HB_DELAY_NR (SCOUTFS_QUORUM_MAX_HB_TIMEO_MS / MSEC_PER_SEC)
|
||||
|
||||
struct quorum_info {
|
||||
struct super_block *sb;
|
||||
struct scoutfs_quorum_config qconf;
|
||||
struct workqueue_struct *workq;
|
||||
struct work_struct work;
|
||||
struct socket *sock;
|
||||
bool shutdown;
|
||||
@@ -125,6 +134,8 @@ struct quorum_info {
|
||||
struct quorum_status show_status;
|
||||
struct last_msg last_send[SCOUTFS_QUORUM_MAX_SLOTS];
|
||||
struct last_msg last_recv[SCOUTFS_QUORUM_MAX_SLOTS];
|
||||
struct count_recent *hb_delay;
|
||||
unsigned long max_hb_delay;
|
||||
|
||||
struct scoutfs_sysfs_attrs ssa;
|
||||
};
|
||||
@@ -134,11 +145,18 @@ struct quorum_info {
|
||||
#define DECLARE_QUORUM_INFO_KOBJ(kobj, name) \
|
||||
DECLARE_QUORUM_INFO(SCOUTFS_SYSFS_ATTRS_SB(kobj), name)
|
||||
|
||||
static bool quorum_slot_present(struct scoutfs_super_block *super, int i)
|
||||
static bool quorum_slot_present(struct scoutfs_quorum_config *qconf, int i)
|
||||
{
|
||||
BUG_ON(i < 0 || i > SCOUTFS_QUORUM_MAX_SLOTS);
|
||||
|
||||
return super->qconf.slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_IPV4);
|
||||
return qconf->slots[i].addr.v4.family == cpu_to_le16(SCOUTFS_AF_IPV4);
|
||||
}
|
||||
|
||||
static void quorum_slot_sin(struct scoutfs_quorum_config *qconf, int i, struct sockaddr_in *sin)
|
||||
{
|
||||
BUG_ON(i < 0 || i >= SCOUTFS_QUORUM_MAX_SLOTS);
|
||||
|
||||
scoutfs_addr_to_sin(sin, &qconf->slots[i].addr);
|
||||
}
|
||||
|
||||
static ktime_t election_timeout(void)
|
||||
@@ -152,15 +170,14 @@ static ktime_t heartbeat_interval(void)
|
||||
return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_IVAL_MS);
|
||||
}
|
||||
|
||||
static ktime_t heartbeat_timeout(void)
|
||||
static ktime_t heartbeat_timeout(struct scoutfs_mount_options *opts)
|
||||
{
|
||||
return ktime_add_ms(ktime_get(), SCOUTFS_QUORUM_HB_TIMEO_MS);
|
||||
return ktime_add_ms(ktime_get(), opts->quorum_heartbeat_timeout_ms);
|
||||
}
|
||||
|
||||
static int create_socket(struct super_block *sb)
|
||||
{
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct socket *sock = NULL;
|
||||
struct sockaddr_in sin;
|
||||
int addrlen;
|
||||
@@ -172,9 +189,10 @@ static int create_socket(struct super_block *sb)
|
||||
goto out;
|
||||
}
|
||||
|
||||
sock->sk->sk_allocation = GFP_NOFS;
|
||||
/* rather fail and retry than block waiting for free */
|
||||
sock->sk->sk_allocation = GFP_ATOMIC;
|
||||
|
||||
scoutfs_quorum_slot_sin(super, qinf->our_quorum_slot_nr, &sin);
|
||||
quorum_slot_sin(&qinf->qconf, qinf->our_quorum_slot_nr, &sin);
|
||||
|
||||
addrlen = sizeof(sin);
|
||||
ret = kernel_bind(sock, (struct sockaddr *)&sin, addrlen);
|
||||
@@ -201,16 +219,20 @@ static __le32 quorum_message_crc(struct scoutfs_quorum_message *qmes)
|
||||
return cpu_to_le32(crc32c(~0, qmes, len));
|
||||
}
|
||||
|
||||
static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
int only)
|
||||
/*
|
||||
* Returns the number of failures from sendmsg.
|
||||
*/
|
||||
static int send_msg_members(struct super_block *sb, int type, u64 term, int only)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
int failed = 0;
|
||||
ktime_t now;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
struct scoutfs_quorum_message qmes = {
|
||||
.fsid = super->hdr.fsid,
|
||||
.fsid = cpu_to_le64(sbi->fsid),
|
||||
.term = cpu_to_le64(term),
|
||||
.type = type,
|
||||
.from = qinf->our_quorum_slot_nr,
|
||||
@@ -232,15 +254,21 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
|
||||
qmes.crc = quorum_message_crc(&qmes);
|
||||
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
if (!quorum_slot_present(super, i) ||
|
||||
if (!quorum_slot_present(&qinf->qconf, i) ||
|
||||
(only >= 0 && i != only) || i == qinf->our_quorum_slot_nr)
|
||||
continue;
|
||||
|
||||
scoutfs_quorum_slot_sin(super, i, &sin);
|
||||
if (scoutfs_forcing_unmount(sb)) {
|
||||
failed = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
scoutfs_quorum_slot_sin(&qinf->qconf, i, &sin);
|
||||
now = ktime_get();
|
||||
kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
|
||||
ret = kernel_sendmsg(qinf->sock, &mh, &kv, 1, kv.iov_len);
|
||||
if (ret != kv.iov_len)
|
||||
failed++;
|
||||
|
||||
spin_lock(&qinf->show_lock);
|
||||
qinf->last_send[i].msg.term = term;
|
||||
@@ -251,6 +279,8 @@ static void send_msg_members(struct super_block *sb, int type, u64 term,
|
||||
if (i == only)
|
||||
break;
|
||||
}
|
||||
|
||||
return failed;
|
||||
}
|
||||
|
||||
#define send_msg_to(sb, type, term, nr) send_msg_members(sb, type, term, nr)
|
||||
@@ -266,7 +296,7 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
|
||||
ktime_t abs_to)
|
||||
{
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_quorum_message qmes;
|
||||
struct timeval tv;
|
||||
ktime_t rel_to;
|
||||
@@ -305,14 +335,17 @@ static int recv_msg(struct super_block *sb, struct quorum_host_msg *msg,
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (scoutfs_forcing_unmount(sb))
|
||||
return 0;
|
||||
|
||||
now = ktime_get();
|
||||
|
||||
if (ret != sizeof(qmes) ||
|
||||
qmes.crc != quorum_message_crc(&qmes) ||
|
||||
qmes.fsid != super->hdr.fsid ||
|
||||
qmes.fsid != cpu_to_le64(sbi->fsid) ||
|
||||
qmes.type >= SCOUTFS_QUORUM_MSG_INVALID ||
|
||||
qmes.from >= SCOUTFS_QUORUM_MAX_SLOTS ||
|
||||
!quorum_slot_present(super, qmes.from)) {
|
||||
!quorum_slot_present(&qinf->qconf, qmes.from)) {
|
||||
/* should we be trying to open a new socket? */
|
||||
scoutfs_inc_counter(sb, quorum_recv_invalid);
|
||||
return -EAGAIN;
|
||||
@@ -342,7 +375,7 @@ static int read_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_q
|
||||
bool check_rid)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
const u64 fsid = sbi->fsid;
|
||||
const u64 rid = sbi->rid;
|
||||
char msg[150];
|
||||
__le32 crc;
|
||||
@@ -367,9 +400,9 @@ static int read_quorum_block(struct super_block *sb, u64 blkno, struct scoutfs_q
|
||||
else if (le32_to_cpu(blk->hdr.magic) != SCOUTFS_BLOCK_MAGIC_QUORUM)
|
||||
snprintf(msg, sizeof(msg), "blk magic %08x != %08x",
|
||||
le32_to_cpu(blk->hdr.magic), SCOUTFS_BLOCK_MAGIC_QUORUM);
|
||||
else if (blk->hdr.fsid != super->hdr.fsid)
|
||||
else if (blk->hdr.fsid != cpu_to_le64(fsid))
|
||||
snprintf(msg, sizeof(msg), "blk fsid %016llx != %016llx",
|
||||
le64_to_cpu(blk->hdr.fsid), le64_to_cpu(super->hdr.fsid));
|
||||
le64_to_cpu(blk->hdr.fsid), fsid);
|
||||
else if (le64_to_cpu(blk->hdr.blkno) != blkno)
|
||||
snprintf(msg, sizeof(msg), "blk blkno %llu != %llu",
|
||||
le64_to_cpu(blk->hdr.blkno), blkno);
|
||||
@@ -410,8 +443,7 @@ out:
|
||||
*/
|
||||
static void read_greatest_term(struct super_block *sb, u64 *term)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct scoutfs_quorum_block blk;
|
||||
int ret;
|
||||
int e;
|
||||
@@ -420,7 +452,7 @@ static void read_greatest_term(struct super_block *sb, u64 *term)
|
||||
*term = 0;
|
||||
|
||||
for (s = 0; s < SCOUTFS_QUORUM_MAX_SLOTS; s++) {
|
||||
if (!quorum_slot_present(super, s))
|
||||
if (!quorum_slot_present(&qinf->qconf, s))
|
||||
continue;
|
||||
|
||||
ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + s, &blk, false);
|
||||
@@ -514,14 +546,15 @@ static int update_quorum_block(struct super_block *sb, int event, u64 term, bool
|
||||
* keeps us from being fenced while we allow userspace fencing to take a
|
||||
* reasonably long time. We still want to timeout eventually.
|
||||
*/
|
||||
int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
|
||||
int scoutfs_quorum_fence_leaders(struct super_block *sb, struct scoutfs_quorum_config *qconf,
|
||||
u64 term)
|
||||
{
|
||||
#define NR_OLD 2
|
||||
struct scoutfs_quorum_block_event old[SCOUTFS_QUORUM_MAX_SLOTS][NR_OLD] = {{{0,}}};
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_quorum_block blk;
|
||||
struct sockaddr_in sin;
|
||||
const __le64 lefsid = cpu_to_le64(sbi->fsid);
|
||||
const u64 rid = sbi->rid;
|
||||
bool fence_started = false;
|
||||
u64 fenced = 0;
|
||||
@@ -534,7 +567,7 @@ int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
|
||||
BUILD_BUG_ON(SCOUTFS_QUORUM_BLOCKS < SCOUTFS_QUORUM_MAX_SLOTS);
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
if (!quorum_slot_present(super, i))
|
||||
if (!quorum_slot_present(qconf, i))
|
||||
continue;
|
||||
|
||||
ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
|
||||
@@ -567,11 +600,11 @@ int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term)
|
||||
continue;
|
||||
|
||||
scoutfs_inc_counter(sb, quorum_fence_leader);
|
||||
scoutfs_quorum_slot_sin(super, i, &sin);
|
||||
quorum_slot_sin(qconf, i, &sin);
|
||||
fence_rid = old[i][j].rid;
|
||||
|
||||
scoutfs_info(sb, "fencing previous leader "SCSBF" at term %llu in slot %u with address "SIN_FMT,
|
||||
SCSB_LEFR_ARGS(super->hdr.fsid, fence_rid),
|
||||
SCSB_LEFR_ARGS(lefsid, fence_rid),
|
||||
le64_to_cpu(old[i][j].term), i, SIN_ARG(&sin));
|
||||
ret = scoutfs_fence_start(sb, le64_to_cpu(fence_rid), sin.sin_addr.s_addr,
|
||||
SCOUTFS_FENCE_QUORUM_BLOCK_LEADER);
|
||||
@@ -592,6 +625,71 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void clear_hb_delay(struct quorum_info *qinf)
|
||||
{
|
||||
int i;
|
||||
|
||||
spin_lock(&qinf->show_lock);
|
||||
qinf->max_hb_delay = 0;
|
||||
for (i = 0; i < HB_DELAY_NR; i++) {
|
||||
qinf->hb_delay[i].recent = ns_to_ktime(0);
|
||||
qinf->hb_delay[i].count = 0;
|
||||
}
|
||||
spin_unlock(&qinf->show_lock);
|
||||
}
|
||||
|
||||
struct hb_recording {
|
||||
ktime_t prev;
|
||||
int count;
|
||||
};
|
||||
|
||||
/*
|
||||
* Record long heartbeat delays. We only record the delay between back
|
||||
* to back send attempts in the leader or back to back recv messages in
|
||||
* the followers. The worker caller sets record_hb when their iteration
|
||||
* sent or received a heartbeat. An iteration that does anything else
|
||||
* resets the tracking.
|
||||
*/
|
||||
static void record_hb_delay(struct super_block *sb, struct quorum_info *qinf,
|
||||
struct hb_recording *hbr, bool record_hb, int role)
|
||||
{
|
||||
bool log = false;
|
||||
ktime_t now;
|
||||
s64 s;
|
||||
|
||||
if (!record_hb) {
|
||||
hbr->count = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
now = ktime_get();
|
||||
|
||||
if (hbr->count < 2 && ++hbr->count < 2) {
|
||||
hbr->prev = now;
|
||||
return;
|
||||
}
|
||||
|
||||
s = ktime_ms_delta(now, hbr->prev) / MSEC_PER_SEC;
|
||||
hbr->prev = now;
|
||||
|
||||
if (s <= 0 || s >= HB_DELAY_NR)
|
||||
return;
|
||||
|
||||
spin_lock(&qinf->show_lock);
|
||||
if (qinf->max_hb_delay < s) {
|
||||
qinf->max_hb_delay = s;
|
||||
if (s >= 3)
|
||||
log = true;
|
||||
}
|
||||
qinf->hb_delay[s].recent = now;
|
||||
qinf->hb_delay[s].count++;
|
||||
spin_unlock(&qinf->show_lock);
|
||||
|
||||
if (log)
|
||||
scoutfs_info(sb, "longest quorum heartbeat %s delay of %lld sec",
|
||||
role == LEADER ? "send" : "recv", s);
|
||||
}
|
||||
|
||||
/*
|
||||
* The main quorum task maintains its private status. It seemed cleaner
|
||||
* to occasionally copy the status for showing in sysfs/debugfs files
|
||||
@@ -616,16 +714,21 @@ static void update_show_status(struct quorum_info *qinf, struct quorum_status *q
|
||||
static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
{
|
||||
struct quorum_info *qinf = container_of(work, struct quorum_info, work);
|
||||
struct scoutfs_mount_options opts;
|
||||
struct super_block *sb = qinf->sb;
|
||||
struct sockaddr_in unused;
|
||||
struct quorum_host_msg msg;
|
||||
struct quorum_status qst = {0,};
|
||||
struct hb_recording hbr = {{0,},};
|
||||
bool record_hb;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
/* recording votes from slots as native single word bitmap */
|
||||
BUILD_BUG_ON(SCOUTFS_QUORUM_MAX_SLOTS > BITS_PER_LONG);
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
/* start out as a follower */
|
||||
qst.role = FOLLOWER;
|
||||
qst.vote_for = -1;
|
||||
@@ -635,7 +738,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
|
||||
/* see if there's a server to choose heartbeat or election timeout */
|
||||
if (scoutfs_quorum_server_sin(sb, &unused) == 0)
|
||||
qst.timeout = heartbeat_timeout();
|
||||
qst.timeout = heartbeat_timeout(&opts);
|
||||
else
|
||||
qst.timeout = election_timeout();
|
||||
|
||||
@@ -659,6 +762,9 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
record_hb = false;
|
||||
|
||||
/* ignore messages from older terms */
|
||||
if (msg.type != SCOUTFS_QUORUM_MSG_INVALID &&
|
||||
msg.term < qst.term)
|
||||
@@ -674,6 +780,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
if (qst.role == LEADER) {
|
||||
scoutfs_warn(sb, "saw msg type %u from %u for term %llu while leader in term %llu, shutting down server.",
|
||||
msg.type, msg.from, msg.term, qst.term);
|
||||
clear_hb_delay(qinf);
|
||||
}
|
||||
qst.role = FOLLOWER;
|
||||
qst.term = msg.term;
|
||||
@@ -682,7 +789,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
scoutfs_inc_counter(sb, quorum_term_follower);
|
||||
|
||||
if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT)
|
||||
qst.timeout = heartbeat_timeout();
|
||||
qst.timeout = heartbeat_timeout(&opts);
|
||||
else
|
||||
qst.timeout = election_timeout();
|
||||
|
||||
@@ -692,6 +799,21 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* receiving heartbeats extends timeout, delaying elections */
|
||||
if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) {
|
||||
qst.timeout = heartbeat_timeout(&opts);
|
||||
scoutfs_inc_counter(sb, quorum_recv_heartbeat);
|
||||
record_hb = true;
|
||||
}
|
||||
|
||||
/* receiving a resignation from server starts election */
|
||||
if (msg.type == SCOUTFS_QUORUM_MSG_RESIGNATION &&
|
||||
qst.role == FOLLOWER &&
|
||||
msg.term == qst.term) {
|
||||
qst.timeout = election_timeout();
|
||||
scoutfs_inc_counter(sb, quorum_recv_resignation);
|
||||
}
|
||||
|
||||
/* followers and candidates start new election on timeout */
|
||||
if (qst.role != LEADER &&
|
||||
ktime_after(ktime_get(), qst.timeout)) {
|
||||
@@ -744,6 +866,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
qst.timeout = heartbeat_interval();
|
||||
|
||||
update_show_status(qinf, &qst);
|
||||
clear_hb_delay(qinf);
|
||||
|
||||
/* record that we've been elected before starting up server */
|
||||
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_ELECT, qst.term, true);
|
||||
@@ -752,7 +875,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
|
||||
qst.server_start_term = qst.term;
|
||||
qst.server_event = SCOUTFS_QUORUM_EVENT_ELECT;
|
||||
scoutfs_server_start(sb, qst.term);
|
||||
scoutfs_server_start(sb, &qinf->qconf, qst.term);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -798,6 +921,7 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_RESIGNATION,
|
||||
qst.server_start_term);
|
||||
scoutfs_inc_counter(sb, quorum_send_resignation);
|
||||
clear_hb_delay(qinf);
|
||||
}
|
||||
|
||||
ret = update_quorum_block(sb, SCOUTFS_QUORUM_EVENT_STOP,
|
||||
@@ -811,24 +935,16 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
/* leaders regularly send heartbeats to delay elections */
|
||||
if (qst.role == LEADER &&
|
||||
ktime_after(ktime_get(), qst.timeout)) {
|
||||
send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT,
|
||||
qst.term);
|
||||
ret = send_msg_others(sb, SCOUTFS_QUORUM_MSG_HEARTBEAT, qst.term);
|
||||
if (ret > 0) {
|
||||
scoutfs_add_counter(sb, quorum_send_heartbeat_dropped, ret);
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
qst.timeout = heartbeat_interval();
|
||||
scoutfs_inc_counter(sb, quorum_send_heartbeat);
|
||||
}
|
||||
record_hb = true;
|
||||
|
||||
/* receiving heartbeats extends timeout, delaying elections */
|
||||
if (msg.type == SCOUTFS_QUORUM_MSG_HEARTBEAT) {
|
||||
qst.timeout = heartbeat_timeout();
|
||||
scoutfs_inc_counter(sb, quorum_recv_heartbeat);
|
||||
}
|
||||
|
||||
/* receiving a resignation from server starts election */
|
||||
if (msg.type == SCOUTFS_QUORUM_MSG_RESIGNATION &&
|
||||
qst.role == FOLLOWER &&
|
||||
msg.term == qst.term) {
|
||||
qst.timeout = election_timeout();
|
||||
scoutfs_inc_counter(sb, quorum_recv_resignation);
|
||||
}
|
||||
|
||||
/* followers vote once per term */
|
||||
@@ -840,6 +956,8 @@ static void scoutfs_quorum_worker(struct work_struct *work)
|
||||
msg.from);
|
||||
scoutfs_inc_counter(sb, quorum_send_vote);
|
||||
}
|
||||
|
||||
record_hb_delay(sb, qinf, &hbr, record_hb, qst.role);
|
||||
}
|
||||
|
||||
update_show_status(qinf, &qst);
|
||||
@@ -877,16 +995,25 @@ out:
|
||||
*/
|
||||
int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_super_block *super = NULL;
|
||||
struct scoutfs_quorum_block blk;
|
||||
u64 elect_term;
|
||||
u64 term = 0;
|
||||
int ret = 0;
|
||||
int i;
|
||||
|
||||
super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
|
||||
if (!super) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_read_super(sb, super);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
if (!quorum_slot_present(super, i))
|
||||
if (!quorum_slot_present(&super->qconf, i))
|
||||
continue;
|
||||
|
||||
ret = read_quorum_block(sb, SCOUTFS_QUORUM_BLKNO + i, &blk, false);
|
||||
@@ -900,7 +1027,7 @@ int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin)
|
||||
if (elect_term > term &&
|
||||
elect_term > le64_to_cpu(blk.events[SCOUTFS_QUORUM_EVENT_STOP].term)) {
|
||||
term = elect_term;
|
||||
scoutfs_quorum_slot_sin(super, i, sin);
|
||||
scoutfs_quorum_slot_sin(&super->qconf, i, sin);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -909,6 +1036,7 @@ int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin)
|
||||
ret = -ENOENT;
|
||||
|
||||
out:
|
||||
kfree(super);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -924,12 +1052,9 @@ u8 scoutfs_quorum_votes_needed(struct super_block *sb)
|
||||
return qinf->votes_needed;
|
||||
}
|
||||
|
||||
void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
|
||||
struct sockaddr_in *sin)
|
||||
void scoutfs_quorum_slot_sin(struct scoutfs_quorum_config *qconf, int i, struct sockaddr_in *sin)
|
||||
{
|
||||
BUG_ON(i < 0 || i >= SCOUTFS_QUORUM_MAX_SLOTS);
|
||||
|
||||
scoutfs_addr_to_sin(sin, &super->qconf.slots[i].addr);
|
||||
return quorum_slot_sin(qconf, i, sin);
|
||||
}
|
||||
|
||||
static char *role_str(int role)
|
||||
@@ -969,9 +1094,11 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
{
|
||||
DECLARE_QUORUM_INFO_KOBJ(kobj, qinf);
|
||||
struct quorum_status qst;
|
||||
struct count_recent cr;
|
||||
struct last_msg last;
|
||||
struct timespec64 ts;
|
||||
const ktime_t now = ktime_get();
|
||||
unsigned long ul;
|
||||
size_t size;
|
||||
int ret;
|
||||
int i;
|
||||
@@ -1029,6 +1156,26 @@ static ssize_t status_show(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
(s64)ts.tv_sec, (int)ts.tv_nsec);
|
||||
}
|
||||
|
||||
spin_lock(&qinf->show_lock);
|
||||
ul = qinf->max_hb_delay;
|
||||
spin_unlock(&qinf->show_lock);
|
||||
if (ul)
|
||||
snprintf_ret(buf, size, &ret, "HB Delay(s) Count Secs Since\n");
|
||||
|
||||
for (i = 1; i <= ul && i < HB_DELAY_NR; i++) {
|
||||
spin_lock(&qinf->show_lock);
|
||||
cr = qinf->hb_delay[i];
|
||||
spin_unlock(&qinf->show_lock);
|
||||
|
||||
if (cr.count == 0)
|
||||
continue;
|
||||
|
||||
ts = ktime_to_timespec64(ktime_sub(now, cr.recent));
|
||||
snprintf_ret(buf, size, &ret,
|
||||
"%11u %9llu %lld.%09u\n",
|
||||
i, cr.count, (s64)ts.tv_sec, (int)ts.tv_nsec);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
SCOUTFS_ATTR_RO(status);
|
||||
@@ -1060,11 +1207,10 @@ static inline bool valid_ipv4_port(__be16 port)
|
||||
return port != 0 && be16_to_cpu(port) != U16_MAX;
|
||||
}
|
||||
|
||||
static int verify_quorum_slots(struct super_block *sb)
|
||||
static int verify_quorum_slots(struct super_block *sb, struct quorum_info *qinf,
|
||||
struct scoutfs_quorum_config *qconf)
|
||||
{
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
char slots[(SCOUTFS_QUORUM_MAX_SLOTS * 3) + 1];
|
||||
DECLARE_QUORUM_INFO(sb, qinf);
|
||||
struct sockaddr_in other;
|
||||
struct sockaddr_in sin;
|
||||
int found = 0;
|
||||
@@ -1074,10 +1220,10 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
|
||||
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
if (!quorum_slot_present(super, i))
|
||||
if (!quorum_slot_present(qconf, i))
|
||||
continue;
|
||||
|
||||
scoutfs_quorum_slot_sin(super, i, &sin);
|
||||
scoutfs_quorum_slot_sin(qconf, i, &sin);
|
||||
|
||||
if (!valid_ipv4_unicast(sin.sin_addr.s_addr)) {
|
||||
scoutfs_err(sb, "quorum slot #%d has invalid ipv4 unicast address: "SIN_FMT,
|
||||
@@ -1092,10 +1238,10 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
}
|
||||
|
||||
for (j = i + 1; j < SCOUTFS_QUORUM_MAX_SLOTS; j++) {
|
||||
if (!quorum_slot_present(super, j))
|
||||
if (!quorum_slot_present(qconf, j))
|
||||
continue;
|
||||
|
||||
scoutfs_quorum_slot_sin(super, j, &other);
|
||||
scoutfs_quorum_slot_sin(qconf, j, &other);
|
||||
|
||||
if (sin.sin_addr.s_addr == other.sin_addr.s_addr &&
|
||||
sin.sin_port == other.sin_port) {
|
||||
@@ -1113,11 +1259,11 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!quorum_slot_present(super, qinf->our_quorum_slot_nr)) {
|
||||
if (!quorum_slot_present(qconf, qinf->our_quorum_slot_nr)) {
|
||||
char *str = slots;
|
||||
*str = '\0';
|
||||
for (i = 0; i < SCOUTFS_QUORUM_MAX_SLOTS; i++) {
|
||||
if (quorum_slot_present(super, i)) {
|
||||
if (quorum_slot_present(qconf, i)) {
|
||||
ret = snprintf(str, &slots[ARRAY_SIZE(slots)] - str, "%c%u",
|
||||
str == slots ? ' ' : ',', i);
|
||||
if (ret < 2 || ret > 3) {
|
||||
@@ -1141,16 +1287,22 @@ static int verify_quorum_slots(struct super_block *sb)
|
||||
else
|
||||
qinf->votes_needed = (found / 2) + 1;
|
||||
|
||||
qinf->qconf = *qconf;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Once this schedules the quorum worker it can be elected leader and
|
||||
* start the server, possibly before this returns.
|
||||
* start the server, possibly before this returns. The quorum agent
|
||||
* would be responsible for tracking the quorum config in the super
|
||||
* block if it changes. Until then it uses a static config that it reads
|
||||
* during setup.
|
||||
*/
|
||||
int scoutfs_quorum_setup(struct super_block *sb)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = NULL;
|
||||
struct scoutfs_mount_options opts;
|
||||
struct quorum_info *qinf;
|
||||
int ret;
|
||||
@@ -1160,7 +1312,14 @@ int scoutfs_quorum_setup(struct super_block *sb)
|
||||
return 0;
|
||||
|
||||
qinf = kzalloc(sizeof(struct quorum_info), GFP_KERNEL);
|
||||
if (!qinf) {
|
||||
super = kmalloc(sizeof(struct scoutfs_super_block), GFP_KERNEL);
|
||||
if (qinf)
|
||||
qinf->hb_delay = __vmalloc(HB_DELAY_NR * sizeof(struct count_recent),
|
||||
GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
|
||||
if (!qinf || !super || !qinf->hb_delay) {
|
||||
if (qinf)
|
||||
vfree(qinf->hb_delay);
|
||||
kfree(qinf);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
@@ -1174,7 +1333,20 @@ int scoutfs_quorum_setup(struct super_block *sb)
|
||||
sbi->quorum_info = qinf;
|
||||
qinf->sb = sb;
|
||||
|
||||
ret = verify_quorum_slots(sb);
|
||||
/* a high priority single threaded context without mem reclaim */
|
||||
qinf->workq = alloc_workqueue("scoutfs_quorum_work",
|
||||
WQ_NON_REENTRANT | WQ_UNBOUND |
|
||||
WQ_HIGHPRI, 1);
|
||||
if (!qinf->workq) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_read_super(sb, super);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = verify_quorum_slots(sb, qinf, &super->qconf);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
@@ -1188,12 +1360,13 @@ int scoutfs_quorum_setup(struct super_block *sb)
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
schedule_work(&qinf->work);
|
||||
queue_work(qinf->workq, &qinf->work);
|
||||
|
||||
out:
|
||||
if (ret)
|
||||
scoutfs_quorum_destroy(sb);
|
||||
|
||||
kfree(super);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@@ -1217,10 +1390,14 @@ void scoutfs_quorum_destroy(struct super_block *sb)
|
||||
qinf->shutdown = true;
|
||||
flush_work(&qinf->work);
|
||||
|
||||
if (qinf->workq)
|
||||
destroy_workqueue(qinf->workq);
|
||||
|
||||
scoutfs_sysfs_destroy_attrs(sb, &qinf->ssa);
|
||||
if (qinf->sock)
|
||||
sock_release(qinf->sock);
|
||||
|
||||
vfree(qinf->hb_delay);
|
||||
kfree(qinf);
|
||||
sbi->quorum_info = NULL;
|
||||
}
|
||||
|
||||
@@ -4,10 +4,11 @@
|
||||
int scoutfs_quorum_server_sin(struct super_block *sb, struct sockaddr_in *sin);
|
||||
|
||||
u8 scoutfs_quorum_votes_needed(struct super_block *sb);
|
||||
void scoutfs_quorum_slot_sin(struct scoutfs_super_block *super, int i,
|
||||
void scoutfs_quorum_slot_sin(struct scoutfs_quorum_config *qconf, int i,
|
||||
struct sockaddr_in *sin);
|
||||
|
||||
int scoutfs_quorum_fence_leaders(struct super_block *sb, u64 term);
|
||||
int scoutfs_quorum_fence_leaders(struct super_block *sb, struct scoutfs_quorum_config *qconf,
|
||||
u64 term);
|
||||
|
||||
int scoutfs_quorum_setup(struct super_block *sb);
|
||||
void scoutfs_quorum_shutdown(struct super_block *sb);
|
||||
|
||||
@@ -817,22 +817,17 @@ TRACE_EVENT(scoutfs_advance_dirty_super,
|
||||
TP_printk(SCSBF" super seq now %llu", SCSB_TRACE_ARGS, __entry->seq)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_dir_add_next_linkref,
|
||||
TRACE_EVENT(scoutfs_dir_add_next_linkref_found,
|
||||
TP_PROTO(struct super_block *sb, __u64 ino, __u64 dir_ino,
|
||||
__u64 dir_pos, int ret, __u64 found_dir_ino,
|
||||
__u64 found_dir_pos, unsigned int name_len),
|
||||
__u64 dir_pos, unsigned int name_len),
|
||||
|
||||
TP_ARGS(sb, ino, dir_ino, dir_pos, ret, found_dir_pos, found_dir_ino,
|
||||
name_len),
|
||||
TP_ARGS(sb, ino, dir_ino, dir_pos, name_len),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, ino)
|
||||
__field(__u64, dir_ino)
|
||||
__field(__u64, dir_pos)
|
||||
__field(int, ret)
|
||||
__field(__u64, found_dir_ino)
|
||||
__field(__u64, found_dir_pos)
|
||||
__field(unsigned int, name_len)
|
||||
),
|
||||
|
||||
@@ -841,16 +836,43 @@ TRACE_EVENT(scoutfs_dir_add_next_linkref,
|
||||
__entry->ino = ino;
|
||||
__entry->dir_ino = dir_ino;
|
||||
__entry->dir_pos = dir_pos;
|
||||
__entry->ret = ret;
|
||||
__entry->found_dir_ino = dir_ino;
|
||||
__entry->found_dir_pos = dir_pos;
|
||||
__entry->name_len = name_len;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" ino %llu dir_ino %llu dir_pos %llu ret %d found_dir_ino %llu found_dir_pos %llu name_len %u",
|
||||
SCSB_TRACE_ARGS, __entry->ino, __entry->dir_pos,
|
||||
__entry->dir_ino, __entry->ret, __entry->found_dir_pos,
|
||||
__entry->found_dir_ino, __entry->name_len)
|
||||
TP_printk(SCSBF" ino %llu dir_ino %llu dir_pos %llu name_len %u",
|
||||
SCSB_TRACE_ARGS, __entry->ino, __entry->dir_ino,
|
||||
__entry->dir_pos, __entry->name_len)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_dir_add_next_linkrefs,
|
||||
TP_PROTO(struct super_block *sb, __u64 ino, __u64 dir_ino,
|
||||
__u64 dir_pos, int count, int nr, int ret),
|
||||
|
||||
TP_ARGS(sb, ino, dir_ino, dir_pos, count, nr, ret),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(__u64, ino)
|
||||
__field(__u64, dir_ino)
|
||||
__field(__u64, dir_pos)
|
||||
__field(int, count)
|
||||
__field(int, nr)
|
||||
__field(int, ret)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
SCSB_TRACE_ASSIGN(sb);
|
||||
__entry->ino = ino;
|
||||
__entry->dir_ino = dir_ino;
|
||||
__entry->dir_pos = dir_pos;
|
||||
__entry->count = count;
|
||||
__entry->nr = nr;
|
||||
__entry->ret = ret;
|
||||
),
|
||||
|
||||
TP_printk(SCSBF" ino %llu dir_ino %llu dir_pos %llu count %d nr %d ret %d",
|
||||
SCSB_TRACE_ARGS, __entry->ino, __entry->dir_ino,
|
||||
__entry->dir_pos, __entry->count, __entry->nr, __entry->ret)
|
||||
);
|
||||
|
||||
TRACE_EVENT(scoutfs_write_begin,
|
||||
@@ -1874,8 +1896,9 @@ DEFINE_EVENT(scoutfs_server_client_count_class, scoutfs_server_client_down,
|
||||
|
||||
DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded),
|
||||
u32 avail_before, u32 freed_before, int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing,
|
||||
exceeded),
|
||||
TP_STRUCT__entry(
|
||||
SCSB_TRACE_FIELDS
|
||||
__field(int, holding)
|
||||
@@ -1883,6 +1906,7 @@ DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
|
||||
__field(int, nr_holders)
|
||||
__field(__u32, avail_before)
|
||||
__field(__u32, freed_before)
|
||||
__field(int, committing)
|
||||
__field(int, exceeded)
|
||||
),
|
||||
TP_fast_assign(
|
||||
@@ -1892,31 +1916,33 @@ DECLARE_EVENT_CLASS(scoutfs_server_commit_users_class,
|
||||
__entry->nr_holders = nr_holders;
|
||||
__entry->avail_before = avail_before;
|
||||
__entry->freed_before = freed_before;
|
||||
__entry->committing = !!committing;
|
||||
__entry->exceeded = !!exceeded;
|
||||
),
|
||||
TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u exceeded %u",
|
||||
TP_printk(SCSBF" holding %u applying %u nr %u avail_before %u freed_before %u committing %u exceeded %u",
|
||||
SCSB_TRACE_ARGS, __entry->holding, __entry->applying, __entry->nr_holders,
|
||||
__entry->avail_before, __entry->freed_before, __entry->exceeded)
|
||||
__entry->avail_before, __entry->freed_before, __entry->committing,
|
||||
__entry->exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_hold,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
u32 avail_before, u32 freed_before, int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_apply,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
u32 avail_before, u32 freed_before, int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_start,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
u32 avail_before, u32 freed_before, int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
|
||||
);
|
||||
DEFINE_EVENT(scoutfs_server_commit_users_class, scoutfs_server_commit_end,
|
||||
TP_PROTO(struct super_block *sb, int holding, int applying, int nr_holders,
|
||||
u32 avail_before, u32 freed_before, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, exceeded)
|
||||
u32 avail_before, u32 freed_before, int committing, int exceeded),
|
||||
TP_ARGS(sb, holding, applying, nr_holders, avail_before, freed_before, committing, exceeded)
|
||||
);
|
||||
|
||||
#define slt_symbolic(mode) \
|
||||
|
||||
@@ -67,6 +67,7 @@ struct commit_users {
|
||||
unsigned int nr_holders;
|
||||
u32 avail_before;
|
||||
u32 freed_before;
|
||||
bool committing;
|
||||
bool exceeded;
|
||||
};
|
||||
|
||||
@@ -84,7 +85,7 @@ do { \
|
||||
__typeof__(cusers) _cusers = (cusers); \
|
||||
trace_scoutfs_server_commit_##which(sb, !list_empty(&_cusers->holding), \
|
||||
!list_empty(&_cusers->applying), _cusers->nr_holders, _cusers->avail_before, \
|
||||
_cusers->freed_before, _cusers->exceeded); \
|
||||
_cusers->freed_before, _cusers->committing, _cusers->exceeded); \
|
||||
} while (0)
|
||||
|
||||
struct server_info {
|
||||
@@ -130,9 +131,9 @@ struct server_info {
|
||||
struct mutex srch_mutex;
|
||||
struct mutex mounted_clients_mutex;
|
||||
|
||||
/* stable versions stored from commits, given in locks and rpcs */
|
||||
seqcount_t roots_seqcount;
|
||||
struct scoutfs_net_roots roots;
|
||||
/* stable super stored from commits, given in locks and rpcs */
|
||||
seqcount_t stable_seqcount;
|
||||
struct scoutfs_super_block stable_super;
|
||||
|
||||
/* serializes getting and setting volume options */
|
||||
seqcount_t volopt_seqcount;
|
||||
@@ -143,11 +144,18 @@ struct server_info {
|
||||
struct work_struct fence_pending_recov_work;
|
||||
/* while running we check for fenced mounts to reclaim */
|
||||
struct delayed_work reclaim_dwork;
|
||||
|
||||
/* a running server gets a static quorum config from quorum as it starts */
|
||||
struct scoutfs_quorum_config qconf;
|
||||
/* a running server maintains a private dirty super */
|
||||
struct scoutfs_super_block dirty_super;
|
||||
};
|
||||
|
||||
#define DECLARE_SERVER_INFO(sb, name) \
|
||||
struct server_info *name = SCOUTFS_SB(sb)->server_info
|
||||
|
||||
#define DIRTY_SUPER_SB(sb) (&SCOUTFS_SB(sb)->server_info->dirty_super)
|
||||
|
||||
/*
|
||||
* The server tracks each connected client.
|
||||
*/
|
||||
@@ -275,6 +283,14 @@ struct commit_hold {
|
||||
* per-holder allocation consumption tracking. The best we can do is
|
||||
* flag all the current holders so that as they release we can see
|
||||
* everyone involved in crossing the limit.
|
||||
*
|
||||
* The consumption of space to record freed blocks is tricky. The
|
||||
* freed_before value was the space available as the holder started.
|
||||
* But that happens before we actually dirty the first block in the
|
||||
* freed list. If that block is too full then we just allocate a new
|
||||
* empty first block. In that case the current remaining here can be a
|
||||
* lot more than the initial freed_before. We account for that and
|
||||
* treat freed_before as the maximum capacity.
|
||||
*/
|
||||
static void check_holder_budget(struct super_block *sb, struct server_info *server,
|
||||
struct commit_users *cusers)
|
||||
@@ -294,8 +310,13 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
|
||||
return;
|
||||
|
||||
scoutfs_alloc_meta_remaining(&server->alloc, &avail_now, &freed_now);
|
||||
|
||||
avail_used = cusers->avail_before - avail_now;
|
||||
freed_used = cusers->freed_before - freed_now;
|
||||
if (freed_now < cusers->freed_before)
|
||||
freed_used = cusers->freed_before - freed_now;
|
||||
else
|
||||
freed_used = SCOUTFS_ALLOC_LIST_MAX_BLOCKS - freed_now;
|
||||
|
||||
budget = cusers->nr_holders * COMMIT_HOLD_ALLOC_BUDGET;
|
||||
if (avail_used <= budget && freed_used <= budget)
|
||||
return;
|
||||
@@ -318,31 +339,18 @@ static void check_holder_budget(struct super_block *sb, struct server_info *serv
|
||||
/*
|
||||
* We don't have per-holder consumption. We allow commit holders as
|
||||
* long as the total budget of all the holders doesn't exceed the alloc
|
||||
* resources that were available
|
||||
* resources that were available. If a hold is waiting for budget
|
||||
* availability in the allocators then we try and kick off a commit to
|
||||
* fill and use the next allocators after the current transaction.
|
||||
*/
|
||||
static bool commit_alloc_has_room(struct server_info *server, struct commit_users *cusers,
|
||||
unsigned int more_holders)
|
||||
{
|
||||
u32 avail_before;
|
||||
u32 freed_before;
|
||||
u32 budget;
|
||||
|
||||
if (cusers->nr_holders > 0) {
|
||||
avail_before = cusers->avail_before;
|
||||
freed_before = cusers->freed_before;
|
||||
} else {
|
||||
scoutfs_alloc_meta_remaining(&server->alloc, &avail_before, &freed_before);
|
||||
}
|
||||
|
||||
budget = (cusers->nr_holders + more_holders) * COMMIT_HOLD_ALLOC_BUDGET;
|
||||
|
||||
return avail_before >= budget && freed_before >= budget;
|
||||
}
|
||||
|
||||
static bool hold_commit(struct super_block *sb, struct server_info *server,
|
||||
struct commit_users *cusers, struct commit_hold *hold)
|
||||
{
|
||||
bool held = false;
|
||||
bool has_room;
|
||||
bool held;
|
||||
u32 budget;
|
||||
u32 av;
|
||||
u32 fr;
|
||||
|
||||
spin_lock(&cusers->lock);
|
||||
|
||||
@@ -350,19 +358,39 @@ static bool hold_commit(struct super_block *sb, struct server_info *server,
|
||||
|
||||
check_holder_budget(sb, server, cusers);
|
||||
|
||||
if (cusers->nr_holders == 0) {
|
||||
scoutfs_alloc_meta_remaining(&server->alloc, &av, &fr);
|
||||
} else {
|
||||
av = cusers->avail_before;
|
||||
fr = cusers->freed_before;
|
||||
}
|
||||
|
||||
/* +2 for our additional hold and then for the final commit work the server does */
|
||||
if (list_empty(&cusers->applying) && commit_alloc_has_room(server, cusers, 2)) {
|
||||
scoutfs_alloc_meta_remaining(&server->alloc, &hold->avail, &hold->freed);
|
||||
budget = (cusers->nr_holders + 2) * COMMIT_HOLD_ALLOC_BUDGET;
|
||||
has_room = av >= budget && fr >= budget;
|
||||
/* checking applying so holders drain once an apply caller starts waiting */
|
||||
held = !cusers->committing && has_room && list_empty(&cusers->applying);
|
||||
|
||||
if (held) {
|
||||
if (cusers->nr_holders == 0) {
|
||||
cusers->avail_before = hold->avail;
|
||||
cusers->freed_before = hold->freed;
|
||||
cusers->avail_before = av;
|
||||
cusers->freed_before = fr;
|
||||
hold->avail = av;
|
||||
hold->freed = fr;
|
||||
cusers->exceeded = false;
|
||||
} else {
|
||||
scoutfs_alloc_meta_remaining(&server->alloc, &hold->avail, &hold->freed);
|
||||
}
|
||||
|
||||
hold->exceeded = false;
|
||||
hold->start = ktime_get();
|
||||
list_add_tail(&hold->entry, &cusers->holding);
|
||||
|
||||
cusers->nr_holders++;
|
||||
held = true;
|
||||
|
||||
} else if (!has_room && cusers->nr_holders == 0 && !cusers->committing) {
|
||||
cusers->committing = true;
|
||||
queue_work(server->wq, &server->commit_work);
|
||||
}
|
||||
|
||||
spin_unlock(&cusers->lock);
|
||||
@@ -396,7 +424,6 @@ static int server_apply_commit(struct super_block *sb, struct commit_hold *hold,
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct commit_users *cusers = &server->cusers;
|
||||
struct timespec ts;
|
||||
bool start_commit;
|
||||
|
||||
spin_lock(&cusers->lock);
|
||||
|
||||
@@ -417,12 +444,14 @@ static int server_apply_commit(struct super_block *sb, struct commit_hold *hold,
|
||||
list_del_init(&hold->entry);
|
||||
hold->ret = err;
|
||||
}
|
||||
cusers->nr_holders--;
|
||||
start_commit = cusers->nr_holders == 0 && !list_empty(&cusers->applying);
|
||||
spin_unlock(&cusers->lock);
|
||||
|
||||
if (start_commit)
|
||||
cusers->nr_holders--;
|
||||
if (cusers->nr_holders == 0 && !cusers->committing && !list_empty(&cusers->applying)) {
|
||||
cusers->committing = true;
|
||||
queue_work(server->wq, &server->commit_work);
|
||||
}
|
||||
|
||||
spin_unlock(&cusers->lock);
|
||||
|
||||
wait_event(cusers->waitq, list_empty_careful(&hold->entry));
|
||||
smp_rmb(); /* entry load before ret */
|
||||
@@ -431,8 +460,8 @@ static int server_apply_commit(struct super_block *sb, struct commit_hold *hold,
|
||||
|
||||
/*
|
||||
* Start a commit from the commit work. We should only have been queued
|
||||
* while a holder is waiting to apply after all active holders have
|
||||
* finished.
|
||||
* while there are no active holders and someone started the commit.
|
||||
* There may or may not be blocked apply callers waiting for the result.
|
||||
*/
|
||||
static int commit_start(struct super_block *sb, struct commit_users *cusers)
|
||||
{
|
||||
@@ -441,7 +470,7 @@ static int commit_start(struct super_block *sb, struct commit_users *cusers)
|
||||
/* make sure holders held off once commit started */
|
||||
spin_lock(&cusers->lock);
|
||||
TRACE_COMMIT_USERS(sb, cusers, start);
|
||||
if (WARN_ON_ONCE(list_empty(&cusers->applying) || cusers->nr_holders != 0))
|
||||
if (WARN_ON_ONCE(!cusers->committing || cusers->nr_holders != 0))
|
||||
ret = -EINVAL;
|
||||
spin_unlock(&cusers->lock);
|
||||
|
||||
@@ -464,21 +493,28 @@ static void commit_end(struct super_block *sb, struct commit_users *cusers, int
|
||||
smp_wmb(); /* ret stores before list updates */
|
||||
list_for_each_entry_safe(hold, tmp, &cusers->applying, entry)
|
||||
list_del_init(&hold->entry);
|
||||
cusers->committing = false;
|
||||
spin_unlock(&cusers->lock);
|
||||
|
||||
wake_up(&cusers->waitq);
|
||||
}
|
||||
|
||||
static void get_roots(struct super_block *sb,
|
||||
struct scoutfs_net_roots *roots)
|
||||
static void get_stable(struct super_block *sb, struct scoutfs_super_block *super,
|
||||
struct scoutfs_net_roots *roots)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
unsigned int seq;
|
||||
|
||||
do {
|
||||
seq = read_seqcount_begin(&server->roots_seqcount);
|
||||
*roots = server->roots;
|
||||
} while (read_seqcount_retry(&server->roots_seqcount, seq));
|
||||
seq = read_seqcount_begin(&server->stable_seqcount);
|
||||
if (super)
|
||||
*super = server->stable_super;
|
||||
if (roots) {
|
||||
roots->fs_root = server->stable_super.fs_root;
|
||||
roots->logs_root = server->stable_super.logs_root;
|
||||
roots->srch_root = server->stable_super.srch_root;
|
||||
}
|
||||
} while (read_seqcount_retry(&server->stable_seqcount, seq));
|
||||
}
|
||||
|
||||
u64 scoutfs_server_seq(struct super_block *sb)
|
||||
@@ -510,17 +546,12 @@ void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq)
|
||||
}
|
||||
}
|
||||
|
||||
static void set_roots(struct server_info *server,
|
||||
struct scoutfs_btree_root *fs_root,
|
||||
struct scoutfs_btree_root *logs_root,
|
||||
struct scoutfs_btree_root *srch_root)
|
||||
static void set_stable_super(struct server_info *server, struct scoutfs_super_block *super)
|
||||
{
|
||||
preempt_disable();
|
||||
write_seqcount_begin(&server->roots_seqcount);
|
||||
server->roots.fs_root = *fs_root;
|
||||
server->roots.logs_root = *logs_root;
|
||||
server->roots.srch_root = *srch_root;
|
||||
write_seqcount_end(&server->roots_seqcount);
|
||||
write_seqcount_begin(&server->stable_seqcount);
|
||||
server->stable_super = *super;
|
||||
write_seqcount_end(&server->stable_seqcount);
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
@@ -535,7 +566,7 @@ static void set_roots(struct server_info *server,
|
||||
* implement commits with a single pending work func.
|
||||
*
|
||||
* Processing paths hold the commit while they're making multiple
|
||||
* dependent changes. When they're done and want it persistent they add
|
||||
* dependent changes. When they're done and want it persistent they
|
||||
* queue the commit work. This work runs, performs the commit, and
|
||||
* wakes all the applying waiters with the result. Readers can run
|
||||
* concurrently with these commits.
|
||||
@@ -545,7 +576,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
|
||||
struct server_info *server = container_of(work, struct server_info,
|
||||
commit_work);
|
||||
struct super_block *sb = server->sb;
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct commit_users *cusers = &server->cusers;
|
||||
int ret;
|
||||
|
||||
@@ -603,8 +634,7 @@ static void scoutfs_server_commit_func(struct work_struct *work)
|
||||
goto out;
|
||||
}
|
||||
|
||||
set_roots(server, &super->fs_root, &super->logs_root,
|
||||
&super->srch_root);
|
||||
set_stable_super(server, super);
|
||||
|
||||
/* swizzle the active and idle server alloc/freed heads */
|
||||
server->other_ind ^= 1;
|
||||
@@ -641,7 +671,7 @@ static int server_alloc_inodes(struct super_block *sb,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_net_inode_alloc ial = { 0, };
|
||||
COMMIT_HOLD(hold);
|
||||
__le64 lecount;
|
||||
@@ -809,7 +839,7 @@ static void mod_bitmap_bits(__le64 *dst, u64 dst_zone_blocks,
|
||||
static int get_data_alloc_zone_bits(struct super_block *sb, u64 rid, __le64 *exclusive,
|
||||
__le64 *vacant, u64 zone_blocks)
|
||||
{
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_log_trees *lt;
|
||||
struct scoutfs_key key;
|
||||
@@ -1040,7 +1070,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
u64 rid, struct commit_hold *hold)
|
||||
{
|
||||
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_log_merge_status stat;
|
||||
struct scoutfs_log_merge_range rng;
|
||||
struct scoutfs_log_trees each_lt;
|
||||
@@ -1242,7 +1272,7 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
const u64 rid = le64_to_cpu(lt->rid);
|
||||
const u64 nr = le64_to_cpu(lt->nr);
|
||||
struct scoutfs_log_trees drain;
|
||||
@@ -1329,7 +1359,7 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
__le64 exclusive[SCOUTFS_DATA_ALLOC_ZONE_LE64S];
|
||||
@@ -1524,7 +1554,7 @@ static int server_commit_log_trees(struct super_block *sb,
|
||||
struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
const u64 rid = scoutfs_net_client_rid(conn);
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
@@ -1579,6 +1609,13 @@ static int server_commit_log_trees(struct super_block *sb,
|
||||
if (ret < 0 || committed)
|
||||
goto unlock;
|
||||
|
||||
/* make sure _update succeeds before we modify srch items */
|
||||
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri, &super->logs_root, &key);
|
||||
if (ret < 0) {
|
||||
err_str = "dirtying lt item";
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/* try to rotate the srch log when big enough */
|
||||
mutex_lock(&server->srch_mutex);
|
||||
ret = scoutfs_srch_rotate_log(sb, &server->alloc, &server->wri,
|
||||
@@ -1593,6 +1630,7 @@ static int server_commit_log_trees(struct super_block *sb,
|
||||
|
||||
ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
|
||||
&super->logs_root, &key, &lt, sizeof(lt));
|
||||
BUG_ON(ret < 0); /* dirtying should have guaranteed success */
|
||||
if (ret < 0)
|
||||
err_str = "updating log trees item";
|
||||
|
||||
@@ -1624,7 +1662,7 @@ static int server_get_roots(struct super_block *sb,
|
||||
memset(&roots, 0, sizeof(roots));
|
||||
ret = -EINVAL;
|
||||
} else {
|
||||
get_roots(sb, &roots);
|
||||
get_stable(sb, NULL, &roots);
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
@@ -1654,7 +1692,7 @@ static int server_get_roots(struct super_block *sb,
|
||||
*/
|
||||
static int reclaim_open_log_tree(struct super_block *sb, u64 rid)
|
||||
{
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_log_trees lt;
|
||||
@@ -1751,9 +1789,8 @@ out:
|
||||
*/
|
||||
static int get_stable_trans_seq(struct super_block *sb, u64 *last_seq_ret)
|
||||
{
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_log_trees *lt;
|
||||
struct scoutfs_key key;
|
||||
@@ -1909,9 +1946,8 @@ static int server_srch_get_compact(struct super_block *sb,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_srch_compact *sc = NULL;
|
||||
COMMIT_HOLD(hold);
|
||||
int ret;
|
||||
@@ -1976,8 +2012,7 @@ static int server_srch_commit_compact(struct super_block *sb,
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_srch_compact *sc;
|
||||
struct scoutfs_alloc_list_head av;
|
||||
struct scoutfs_alloc_list_head fr;
|
||||
@@ -2046,28 +2081,48 @@ out:
|
||||
* reset the next range key if there's still work to do. If the
|
||||
* operation is complete then we tear down the input log_trees items and
|
||||
* delete the status.
|
||||
*
|
||||
* Processing all the completions can take more than one transaction.
|
||||
* We return -EINPROGRESS if we have to commit a transaction and the
|
||||
* caller will apply the commit and immediately call back in so we can
|
||||
* perform another commit. We need to be very careful to leave the
|
||||
* status in a state where requests won't be issued at the wrong time
|
||||
* (by forcing nr_completions to a batch while we delete them).
|
||||
*/
|
||||
static int splice_log_merge_completions(struct super_block *sb,
|
||||
struct scoutfs_log_merge_status *stat,
|
||||
bool no_ranges)
|
||||
{
|
||||
struct server_info *server = SCOUTFS_SB(sb)->server_info;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_log_merge_complete comp;
|
||||
struct scoutfs_log_merge_freeing fr;
|
||||
struct scoutfs_log_merge_range rng;
|
||||
struct scoutfs_log_trees lt = {{{0,}}};
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
bool upd_stat = true;
|
||||
int einprogress = 0;
|
||||
struct scoutfs_key key;
|
||||
char *err_str = NULL;
|
||||
u32 alloc_low;
|
||||
u32 tmp;
|
||||
u64 seq;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
/* mustn't rebalance fs tree parents while reqs rely on their key bounds */
|
||||
if (WARN_ON_ONCE(le64_to_cpu(stat->nr_requests) > 0))
|
||||
return -EIO;
|
||||
|
||||
/*
|
||||
* Be overly conservative about how low the allocator can get
|
||||
* before we commit. This gives us a lot of work to do in a
|
||||
* commit while also allowing a pretty big smallest allocator to
|
||||
* work with the theoretically unbounded alloc list splicing.
|
||||
*/
|
||||
scoutfs_alloc_meta_remaining(&server->alloc, &alloc_low, &tmp);
|
||||
alloc_low = min(alloc_low, tmp) / 4;
|
||||
|
||||
/*
|
||||
* Splice in all the completed subtrees at the initial parent
|
||||
* blocks in the main fs_tree before rebalancing any of them.
|
||||
@@ -2089,6 +2144,22 @@ static int splice_log_merge_completions(struct super_block *sb,
|
||||
|
||||
seq = le64_to_cpu(comp.seq);
|
||||
|
||||
/*
|
||||
* Use having cleared the lists as an indication that
|
||||
* we've already set the parents and don't need to dirty
|
||||
* the btree blocks to do it all over again. This is
|
||||
* safe because there is always an fs block that the
|
||||
* merge dirties and frees into the meta_freed list.
|
||||
*/
|
||||
if (comp.meta_avail.ref.blkno == 0 && comp.meta_freed.ref.blkno == 0)
|
||||
continue;
|
||||
|
||||
if (scoutfs_alloc_meta_low(sb, &server->alloc, alloc_low)) {
|
||||
einprogress = -EINPROGRESS;
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = scoutfs_btree_set_parent(sb, &server->alloc, &server->wri,
|
||||
&super->fs_root, &comp.start,
|
||||
&comp.root);
|
||||
@@ -2123,6 +2194,14 @@ static int splice_log_merge_completions(struct super_block *sb,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Once we start rebalancing we force the number of completions
|
||||
* to a batch so that requests won't be issued. Once we're done
|
||||
* we clear the completion count and requests can flow again.
|
||||
*/
|
||||
if (le64_to_cpu(stat->nr_complete) < LOG_MERGE_SPLICE_BATCH)
|
||||
stat->nr_complete = cpu_to_le64(LOG_MERGE_SPLICE_BATCH);
|
||||
|
||||
/*
|
||||
* Now with all the parent blocks spliced in, rebalance items
|
||||
* amongst parents that needed to split/join and delete the
|
||||
@@ -2144,6 +2223,12 @@ static int splice_log_merge_completions(struct super_block *sb,
|
||||
|
||||
seq = le64_to_cpu(comp.seq);
|
||||
|
||||
if (scoutfs_alloc_meta_low(sb, &server->alloc, alloc_low)) {
|
||||
einprogress = -EINPROGRESS;
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* balance when there was a remaining key range */
|
||||
if (le64_to_cpu(comp.flags) & SCOUTFS_LOG_MERGE_COMP_REMAIN) {
|
||||
ret = scoutfs_btree_rebalance(sb, &server->alloc,
|
||||
@@ -2183,18 +2268,11 @@ static int splice_log_merge_completions(struct super_block *sb,
|
||||
}
|
||||
}
|
||||
|
||||
/* update the status once all completes are processed */
|
||||
scoutfs_key_set_zeros(&stat->next_range_key);
|
||||
stat->nr_complete = 0;
|
||||
|
||||
/* update counts and done if there's still ranges to process */
|
||||
if (!no_ranges) {
|
||||
init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
|
||||
ret = scoutfs_btree_update(sb, &server->alloc, &server->wri,
|
||||
&super->log_merge, &key,
|
||||
stat, sizeof(*stat));
|
||||
if (ret < 0)
|
||||
err_str = "update status";
|
||||
scoutfs_key_set_zeros(&stat->next_range_key);
|
||||
stat->nr_complete = 0;
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -2230,6 +2308,12 @@ static int splice_log_merge_completions(struct super_block *sb,
|
||||
(le64_to_cpu(lt.finalize_seq) < le64_to_cpu(stat->seq))))
|
||||
continue;
|
||||
|
||||
if (scoutfs_alloc_meta_low(sb, &server->alloc, alloc_low)) {
|
||||
einprogress = -EINPROGRESS;
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
fr.root = lt.item_root;
|
||||
scoutfs_key_set_zeros(&fr.key);
|
||||
fr.seq = cpu_to_le64(scoutfs_server_next_seq(sb));
|
||||
@@ -2263,9 +2347,10 @@ static int splice_log_merge_completions(struct super_block *sb,
|
||||
}
|
||||
|
||||
le64_add_cpu(&super->inode_count, le64_to_cpu(lt.inode_count_delta));
|
||||
|
||||
}
|
||||
|
||||
/* everything's done, remove the merge operation */
|
||||
upd_stat = false;
|
||||
init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
|
||||
ret = scoutfs_btree_delete(sb, &server->alloc, &server->wri,
|
||||
&super->log_merge, &key);
|
||||
@@ -2274,12 +2359,23 @@ static int splice_log_merge_completions(struct super_block *sb,
|
||||
else
|
||||
err_str = "deleting merge status item";
|
||||
out:
|
||||
if (upd_stat) {
|
||||
init_log_merge_key(&key, SCOUTFS_LOG_MERGE_STATUS_ZONE, 0, 0);
|
||||
err = scoutfs_btree_update(sb, &server->alloc, &server->wri,
|
||||
&super->log_merge, &key,
|
||||
stat, sizeof(struct scoutfs_log_merge_status));
|
||||
if (err && !ret) {
|
||||
err_str = "updating merge status item";
|
||||
ret = err;
|
||||
}
|
||||
}
|
||||
|
||||
if (ret < 0)
|
||||
scoutfs_err(sb, "server error %d splicing log merge completion: %s", ret, err_str);
|
||||
|
||||
BUG_ON(ret); /* inconsistent */
|
||||
|
||||
return ret;
|
||||
return ret ?: einprogress;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2370,7 +2466,7 @@ static void server_log_merge_free_work(struct work_struct *work)
|
||||
struct server_info *server = container_of(work, struct server_info,
|
||||
log_merge_free_work);
|
||||
struct super_block *sb = server->sb;
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_log_merge_freeing fr;
|
||||
struct scoutfs_key key;
|
||||
COMMIT_HOLD(hold);
|
||||
@@ -2454,6 +2550,12 @@ static void server_log_merge_free_work(struct work_struct *work)
|
||||
}
|
||||
|
||||
/*
|
||||
* Clients regularly ask if there is log merge work to do. We process
|
||||
* completions inline before responding so that we don't create large
|
||||
* delays between completion processing and the next request. We don't
|
||||
* mind if the client get_log_merge request sees high latency, the
|
||||
* blocked caller has nothing else to do.
|
||||
*
|
||||
* This will return ENOENT to the client if there is no work to do.
|
||||
*/
|
||||
static int server_get_log_merge(struct super_block *sb,
|
||||
@@ -2462,8 +2564,7 @@ static int server_get_log_merge(struct super_block *sb,
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_log_merge_status stat;
|
||||
struct scoutfs_log_merge_range rng;
|
||||
struct scoutfs_log_merge_range remain;
|
||||
@@ -2522,14 +2623,22 @@ restart:
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* maybe splice now that we know if there's ranges */
|
||||
/* splice if we have a batch or ran out of ranges */
|
||||
no_next = ret == -ENOENT;
|
||||
no_ranges = scoutfs_key_is_zeros(&stat.next_range_key) && ret == -ENOENT;
|
||||
if (le64_to_cpu(stat.nr_requests) == 0 &&
|
||||
(no_next || le64_to_cpu(stat.nr_complete) >= LOG_MERGE_SPLICE_BATCH)) {
|
||||
ret = splice_log_merge_completions(sb, &stat, no_ranges);
|
||||
if (ret < 0)
|
||||
if (ret == -EINPROGRESS) {
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
ret = server_apply_commit(sb, &hold, 0);
|
||||
if (ret < 0)
|
||||
goto respond;
|
||||
server_hold_commit(sb, &hold);
|
||||
mutex_lock(&server->logs_mutex);
|
||||
} else if (ret < 0) {
|
||||
goto out;
|
||||
}
|
||||
/* splicing resets key and adds ranges, could finish status */
|
||||
goto restart;
|
||||
}
|
||||
@@ -2731,6 +2840,7 @@ out:
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
ret = server_apply_commit(sb, &hold, ret);
|
||||
|
||||
respond:
|
||||
return scoutfs_net_response(sb, conn, cmd, id, ret, &req, sizeof(req));
|
||||
}
|
||||
|
||||
@@ -2746,8 +2856,7 @@ static int server_commit_log_merge(struct super_block *sb,
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
u64 rid = scoutfs_net_client_rid(conn);
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_log_merge_request orig_req;
|
||||
struct scoutfs_log_merge_complete *comp;
|
||||
struct scoutfs_log_merge_status stat;
|
||||
@@ -2982,7 +3091,7 @@ static int server_set_volopt(struct super_block *sb, struct scoutfs_net_connecti
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_volume_options *volopt;
|
||||
COMMIT_HOLD(hold);
|
||||
u64 opt;
|
||||
@@ -3051,7 +3160,7 @@ static int server_clear_volopt(struct super_block *sb, struct scoutfs_net_connec
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_volume_options *volopt;
|
||||
COMMIT_HOLD(hold);
|
||||
__le64 *opt;
|
||||
@@ -3105,7 +3214,7 @@ static int server_resize_devices(struct super_block *sb, struct scoutfs_net_conn
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_net_resize_devices *nrd;
|
||||
COMMIT_HOLD(hold);
|
||||
u64 meta_tot;
|
||||
@@ -3212,16 +3321,19 @@ static int count_free_blocks(struct super_block *sb, void *arg, int owner,
|
||||
}
|
||||
|
||||
/*
|
||||
* We calculate the total inode count and free blocks from the current in-memory dirty
|
||||
* versions of the super block and log_trees structs, so we have to lock them.
|
||||
* We calculate the total inode count and free blocks from the last
|
||||
* stable super that was written. Other users also walk stable blocks
|
||||
* so by joining them we don't have to worry about ensuring that we've
|
||||
* locked all the dirty structures that the summations could reference.
|
||||
* We handle stale reads by retrying with the most recent stable super.
|
||||
*/
|
||||
static int server_statfs(struct super_block *sb, struct scoutfs_net_connection *conn,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block super;
|
||||
struct scoutfs_net_statfs nst = {{0,}};
|
||||
struct statfs_free_blocks sfb = {0,};
|
||||
DECLARE_SAVED_REFS(saved);
|
||||
u64 inode_count;
|
||||
int ret;
|
||||
|
||||
@@ -3230,24 +3342,24 @@ static int server_statfs(struct super_block *sb, struct scoutfs_net_connection *
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_lock(&server->alloc_mutex);
|
||||
ret = scoutfs_alloc_foreach_super(sb, super, count_free_blocks, &sfb);
|
||||
mutex_unlock(&server->alloc_mutex);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
do {
|
||||
get_stable(sb, &super, NULL);
|
||||
|
||||
mutex_lock(&server->logs_mutex);
|
||||
ret = scoutfs_forest_inode_count(sb, super, &inode_count);
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
ret = scoutfs_alloc_foreach_super(sb, &super, count_free_blocks, &sfb) ?:
|
||||
scoutfs_forest_inode_count(sb, &super, &inode_count);
|
||||
if (ret < 0 && ret != -ESTALE)
|
||||
goto out;
|
||||
|
||||
BUILD_BUG_ON(sizeof(nst.uuid) != sizeof(super->uuid));
|
||||
memcpy(nst.uuid, super->uuid, sizeof(nst.uuid));
|
||||
ret = scoutfs_block_check_stale(sb, ret, &saved, &super.logs_root.ref,
|
||||
&super.srch_root.ref);
|
||||
} while (ret == -ESTALE);
|
||||
|
||||
BUILD_BUG_ON(sizeof(nst.uuid) != sizeof(super.uuid));
|
||||
memcpy(nst.uuid, super.uuid, sizeof(nst.uuid));
|
||||
nst.free_meta_blocks = cpu_to_le64(sfb.meta);
|
||||
nst.total_meta_blocks = super->total_meta_blocks;
|
||||
nst.total_meta_blocks = super.total_meta_blocks;
|
||||
nst.free_data_blocks = cpu_to_le64(sfb.data);
|
||||
nst.total_data_blocks = super->total_data_blocks;
|
||||
nst.total_data_blocks = super.total_data_blocks;
|
||||
nst.inode_count = cpu_to_le64(inode_count);
|
||||
|
||||
ret = 0;
|
||||
@@ -3278,7 +3390,7 @@ static int insert_mounted_client(struct super_block *sb, u64 rid, u64 gr_flags,
|
||||
struct sockaddr_in *sin)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_mounted_client_btree_val mcv;
|
||||
struct scoutfs_key key;
|
||||
int ret;
|
||||
@@ -3304,7 +3416,7 @@ static int lookup_mounted_client_addr(struct super_block *sb, u64 rid,
|
||||
union scoutfs_inet_addr *addr)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_mounted_client_btree_val *mcv;
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_key key;
|
||||
@@ -3338,7 +3450,7 @@ static int lookup_mounted_client_addr(struct super_block *sb, u64 rid,
|
||||
static int delete_mounted_client(struct super_block *sb, u64 rid)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_key key;
|
||||
int ret;
|
||||
|
||||
@@ -3362,7 +3474,7 @@ static int delete_mounted_client(struct super_block *sb, u64 rid)
|
||||
static int cancel_srch_compact(struct super_block *sb, u64 rid)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_alloc_list_head av;
|
||||
struct scoutfs_alloc_list_head fr;
|
||||
int ret;
|
||||
@@ -3414,7 +3526,7 @@ static int cancel_srch_compact(struct super_block *sb, u64 rid)
|
||||
static int cancel_log_merge(struct super_block *sb, u64 rid)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_log_merge_status stat;
|
||||
struct scoutfs_log_merge_request req;
|
||||
struct scoutfs_log_merge_range rng;
|
||||
@@ -3538,7 +3650,7 @@ static int server_greeting(struct super_block *sb,
|
||||
u8 cmd, u64 id, void *arg, u16 arg_len)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_net_greeting *gr = arg;
|
||||
struct scoutfs_net_greeting greet;
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
@@ -3554,10 +3666,9 @@ static int server_greeting(struct super_block *sb,
|
||||
goto send_err;
|
||||
}
|
||||
|
||||
if (gr->fsid != super->hdr.fsid) {
|
||||
if (gr->fsid != cpu_to_le64(sbi->fsid)) {
|
||||
scoutfs_warn(sb, "client rid %016llx greeting fsid 0x%llx did not match server fsid 0x%llx",
|
||||
le64_to_cpu(gr->rid), le64_to_cpu(gr->fsid),
|
||||
le64_to_cpu(super->hdr.fsid));
|
||||
le64_to_cpu(gr->rid), le64_to_cpu(gr->fsid), sbi->fsid);
|
||||
ret = -EINVAL;
|
||||
goto send_err;
|
||||
}
|
||||
@@ -3697,7 +3808,7 @@ static void farewell_worker(struct work_struct *work)
|
||||
struct server_info *server = container_of(work, struct server_info,
|
||||
farewell_work);
|
||||
struct super_block *sb = server->sb;
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_mounted_client_btree_val *mcv;
|
||||
struct farewell_request *tmp;
|
||||
struct farewell_request *fw;
|
||||
@@ -4059,7 +4170,7 @@ static void recovery_timeout(struct super_block *sb)
|
||||
static int start_recovery(struct super_block *sb)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
struct scoutfs_key key;
|
||||
unsigned int nr = 0;
|
||||
@@ -4176,8 +4287,7 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
struct server_info *server = container_of(work, struct server_info,
|
||||
work);
|
||||
struct super_block *sb = server->sb;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct scoutfs_super_block *super = &sbi->super;
|
||||
struct scoutfs_super_block *super = DIRTY_SUPER_SB(sb);
|
||||
struct scoutfs_net_connection *conn = NULL;
|
||||
struct scoutfs_mount_options opts;
|
||||
DECLARE_WAIT_QUEUE_HEAD(waitq);
|
||||
@@ -4189,13 +4299,13 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
trace_scoutfs_server_work_enter(sb, 0, 0);
|
||||
|
||||
scoutfs_options_read(sb, &opts);
|
||||
scoutfs_quorum_slot_sin(super, opts.quorum_slot_nr, &sin);
|
||||
scoutfs_quorum_slot_sin(&server->qconf, opts.quorum_slot_nr, &sin);
|
||||
scoutfs_info(sb, "server starting at "SIN_FMT, SIN_ARG(&sin));
|
||||
|
||||
scoutfs_block_writer_init(sb, &server->wri);
|
||||
|
||||
/* first make sure no other servers are still running */
|
||||
ret = scoutfs_quorum_fence_leaders(sb, server->term);
|
||||
ret = scoutfs_quorum_fence_leaders(sb, &server->qconf, server->term);
|
||||
if (ret < 0) {
|
||||
scoutfs_err(sb, "server error %d attempting to fence previous leaders", ret);
|
||||
goto out;
|
||||
@@ -4231,8 +4341,7 @@ static void scoutfs_server_worker(struct work_struct *work)
|
||||
write_seqcount_end(&server->volopt_seqcount);
|
||||
|
||||
atomic64_set(&server->seq_atomic, le64_to_cpu(super->seq));
|
||||
set_roots(server, &super->fs_root, &super->logs_root,
|
||||
&super->srch_root);
|
||||
set_stable_super(server, super);
|
||||
|
||||
/* prepare server alloc for this transaction, larger first */
|
||||
if (le64_to_cpu(super->server_meta_avail[0].total_nr) <
|
||||
@@ -4326,11 +4435,12 @@ out:
|
||||
/*
|
||||
* Start the server but don't wait for it to complete.
|
||||
*/
|
||||
void scoutfs_server_start(struct super_block *sb, u64 term)
|
||||
void scoutfs_server_start(struct super_block *sb, struct scoutfs_quorum_config *qconf, u64 term)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
|
||||
if (cmpxchg(&server->status, SERVER_DOWN, SERVER_STARTING) == SERVER_DOWN) {
|
||||
server->qconf = *qconf;
|
||||
server->term = term;
|
||||
queue_work(server->wq, &server->work);
|
||||
}
|
||||
@@ -4382,7 +4492,7 @@ int scoutfs_server_setup(struct super_block *sb)
|
||||
INIT_WORK(&server->log_merge_free_work, server_log_merge_free_work);
|
||||
mutex_init(&server->srch_mutex);
|
||||
mutex_init(&server->mounted_clients_mutex);
|
||||
seqcount_init(&server->roots_seqcount);
|
||||
seqcount_init(&server->stable_seqcount);
|
||||
seqcount_init(&server->volopt_seqcount);
|
||||
mutex_init(&server->volopt_mutex);
|
||||
INIT_WORK(&server->fence_pending_recov_work, fence_pending_recov_worker);
|
||||
|
||||
@@ -75,7 +75,7 @@ u64 scoutfs_server_seq(struct super_block *sb);
|
||||
u64 scoutfs_server_next_seq(struct super_block *sb);
|
||||
void scoutfs_server_set_seq_if_greater(struct super_block *sb, u64 seq);
|
||||
|
||||
void scoutfs_server_start(struct super_block *sb, u64 term);
|
||||
void scoutfs_server_start(struct super_block *sb, struct scoutfs_quorum_config *qconf, u64 term);
|
||||
void scoutfs_server_stop(struct super_block *sb);
|
||||
void scoutfs_server_stop_wait(struct super_block *sb);
|
||||
bool scoutfs_server_is_running(struct super_block *sb);
|
||||
|
||||
@@ -861,7 +861,6 @@ int scoutfs_srch_search_xattrs(struct super_block *sb,
|
||||
struct scoutfs_srch_rb_root *sroot,
|
||||
u64 hash, u64 ino, u64 last_ino, bool *done)
|
||||
{
|
||||
struct scoutfs_net_roots prev_roots;
|
||||
struct scoutfs_net_roots roots;
|
||||
struct scoutfs_srch_entry start;
|
||||
struct scoutfs_srch_entry end;
|
||||
@@ -869,6 +868,7 @@ int scoutfs_srch_search_xattrs(struct super_block *sb,
|
||||
struct scoutfs_log_trees lt;
|
||||
struct scoutfs_srch_file sfl;
|
||||
SCOUTFS_BTREE_ITEM_REF(iref);
|
||||
DECLARE_SAVED_REFS(saved);
|
||||
struct scoutfs_key key;
|
||||
unsigned long limit = SRCH_LIMIT;
|
||||
int ret;
|
||||
@@ -877,7 +877,6 @@ int scoutfs_srch_search_xattrs(struct super_block *sb,
|
||||
|
||||
*done = false;
|
||||
srch_init_rb_root(sroot);
|
||||
memset(&prev_roots, 0, sizeof(prev_roots));
|
||||
|
||||
start.hash = cpu_to_le64(hash);
|
||||
start.ino = cpu_to_le64(ino);
|
||||
@@ -892,7 +891,6 @@ retry:
|
||||
ret = scoutfs_client_get_roots(sb, &roots);
|
||||
if (ret)
|
||||
goto out;
|
||||
memset(&roots.fs_root, 0, sizeof(roots.fs_root));
|
||||
|
||||
end = final;
|
||||
|
||||
@@ -968,16 +966,10 @@ retry:
|
||||
*done = sre_cmp(&end, &final) == 0;
|
||||
ret = 0;
|
||||
out:
|
||||
if (ret == -ESTALE) {
|
||||
if (memcmp(&prev_roots, &roots, sizeof(roots)) == 0) {
|
||||
scoutfs_inc_counter(sb, srch_search_stale_eio);
|
||||
ret = -EIO;
|
||||
} else {
|
||||
scoutfs_inc_counter(sb, srch_search_stale_retry);
|
||||
prev_roots = roots;
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
ret = scoutfs_block_check_stale(sb, ret, &saved, &roots.srch_root.ref,
|
||||
&roots.logs_root.ref);
|
||||
if (ret == -ESTALE)
|
||||
goto retry;
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -1003,6 +995,14 @@ int scoutfs_srch_rotate_log(struct super_block *sb,
|
||||
le64_to_cpu(sfl->ref.blkno), 0);
|
||||
ret = scoutfs_btree_insert(sb, alloc, wri, root, &key,
|
||||
sfl, sizeof(*sfl));
|
||||
/*
|
||||
* While it's fine to replay moving the client's logging srch
|
||||
* file to the core btree item, server commits should keep it
|
||||
* from happening. So we'll warn if we see it happen. This can
|
||||
* be removed eventually.
|
||||
*/
|
||||
if (WARN_ON_ONCE(ret == -EEXIST))
|
||||
ret = 0;
|
||||
if (ret == 0) {
|
||||
memset(sfl, 0, sizeof(*sfl));
|
||||
scoutfs_inc_counter(sb, srch_rotate_log);
|
||||
|
||||
@@ -461,9 +461,8 @@ static int scoutfs_read_supers(struct super_block *sb)
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
||||
sbi->fsid = le64_to_cpu(meta_super->hdr.fsid);
|
||||
sbi->fmt_vers = le64_to_cpu(meta_super->fmt_vers);
|
||||
sbi->super = *meta_super;
|
||||
out:
|
||||
kfree(meta_super);
|
||||
kfree(data_super);
|
||||
@@ -487,6 +486,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
sb->s_export_op = &scoutfs_export_ops;
|
||||
sb->s_xattr = scoutfs_xattr_handlers;
|
||||
sb->s_flags |= MS_I_VERSION | MS_POSIXACL;
|
||||
sb->s_time_gran = 1;
|
||||
|
||||
/* btree blocks use long lived bh->b_data refs */
|
||||
mapping_set_gfp_mask(sb->s_bdev->bd_inode->i_mapping, GFP_NOFS);
|
||||
|
||||
@@ -35,11 +35,10 @@ struct scoutfs_sb_info {
|
||||
struct super_block *sb;
|
||||
|
||||
/* assigned once at the start of each mount, read-only */
|
||||
u64 fsid;
|
||||
u64 rid;
|
||||
u64 fmt_vers;
|
||||
|
||||
struct scoutfs_super_block super;
|
||||
|
||||
struct block_device *meta_bdev;
|
||||
|
||||
spinlock_t next_ino_lock;
|
||||
@@ -135,14 +134,14 @@ static inline bool scoutfs_unmounting(struct super_block *sb)
|
||||
(int)(le64_to_cpu(fsid) >> SCSB_SHIFT), \
|
||||
(int)(le64_to_cpu(rid) >> SCSB_SHIFT)
|
||||
#define SCSB_ARGS(sb) \
|
||||
(int)(le64_to_cpu(SCOUTFS_SB(sb)->super.hdr.fsid) >> SCSB_SHIFT), \
|
||||
(int)(SCOUTFS_SB(sb)->fsid >> SCSB_SHIFT), \
|
||||
(int)(SCOUTFS_SB(sb)->rid >> SCSB_SHIFT)
|
||||
#define SCSB_TRACE_FIELDS \
|
||||
__field(__u64, fsid) \
|
||||
__field(__u64, rid)
|
||||
#define SCSB_TRACE_ASSIGN(sb) \
|
||||
__entry->fsid = SCOUTFS_HAS_SBI(sb) ? \
|
||||
le64_to_cpu(SCOUTFS_SB(sb)->super.hdr.fsid) : 0;\
|
||||
SCOUTFS_SB(sb)->fsid : 0; \
|
||||
__entry->rid = SCOUTFS_HAS_SBI(sb) ? \
|
||||
SCOUTFS_SB(sb)->rid : 0;
|
||||
#define SCSB_TRACE_ARGS \
|
||||
|
||||
@@ -60,10 +60,9 @@ static ssize_t fsid_show(struct kobject *kobj, struct attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct super_block *sb = KOBJ_TO_SB(kobj, sb_id_kobj);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%016llx\n",
|
||||
le64_to_cpu(super->hdr.fsid));
|
||||
return snprintf(buf, PAGE_SIZE, "%016llx\n", sbi->fsid);
|
||||
}
|
||||
ATTR_FUNCS_RO(fsid);
|
||||
|
||||
|
||||
1
tests/.gitignore
vendored
1
tests/.gitignore
vendored
@@ -8,3 +8,4 @@ src/bulk_create_paths
|
||||
src/find_xattrs
|
||||
src/stage_tmpfile
|
||||
src/create_xattr_loop
|
||||
src/o_tmpfile_umask
|
||||
|
||||
@@ -11,7 +11,8 @@ BIN := src/createmany \
|
||||
src/stage_tmpfile \
|
||||
src/find_xattrs \
|
||||
src/create_xattr_loop \
|
||||
src/fragmented_data_extents
|
||||
src/fragmented_data_extents \
|
||||
src/o_tmpfile_umask
|
||||
|
||||
DEPS := $(wildcard src/*.d)
|
||||
|
||||
|
||||
@@ -39,6 +39,18 @@ t_quiet()
|
||||
t_fail "quiet command failed"
|
||||
}
|
||||
|
||||
#
|
||||
# Quietly run a command during a test. The output is logged but only
|
||||
# the return code is printed, presumably because the output contains
|
||||
# a lot of invocation specific text that is difficult to filter.
|
||||
#
|
||||
t_rc()
|
||||
{
|
||||
echo "# $*" >> "$T_TMP.rc.log"
|
||||
"$@" >> "$T_TMP.rc.log" 2>&1
|
||||
echo "rc: $?"
|
||||
}
|
||||
|
||||
#
|
||||
# redirect test output back to the output of the invoking script instead
|
||||
# of the compared output.
|
||||
|
||||
@@ -18,6 +18,7 @@ t_filter_dmesg()
|
||||
|
||||
# the kernel can just be noisy
|
||||
re=" used greatest stack depth: "
|
||||
re="$re|sched: RT throttling activated"
|
||||
|
||||
# mkfs/mount checks partition tables
|
||||
re="$re|unknown partition table"
|
||||
@@ -61,6 +62,7 @@ t_filter_dmesg()
|
||||
re="$re|scoutfs .* error: meta_super META flag not set"
|
||||
re="$re|scoutfs .* error: could not open metadev:.*"
|
||||
re="$re|scoutfs .* error: Unknown or malformed option,.*"
|
||||
re="$re|scoutfs .* error: invalid quorum_heartbeat_timeout_ms value"
|
||||
|
||||
# in debugging kernels we can slow things down a bit
|
||||
re="$re|hrtimer: interrupt took .*"
|
||||
@@ -81,6 +83,7 @@ t_filter_dmesg()
|
||||
re="$re|scoutfs .* error .* freeing merged btree blocks.*.final commit del.upd freeing item"
|
||||
re="$re|scoutfs .* error .*reading quorum block.*to update event.*"
|
||||
re="$re|scoutfs .* error.*server failed to bind to.*"
|
||||
re="$re|scoutfs .* critical transaction commit failure.*"
|
||||
|
||||
egrep -v "($re)"
|
||||
}
|
||||
|
||||
@@ -75,6 +75,15 @@ t_fs_nrs()
|
||||
seq 0 $((T_NR_MOUNTS - 1))
|
||||
}
|
||||
|
||||
#
|
||||
# output the fs nrs of quorum nodes, we "know" that
|
||||
# the quorum nrs are the first consecutive nrs
|
||||
#
|
||||
t_quorum_nrs()
|
||||
{
|
||||
seq 0 $((T_QUORUM - 1))
|
||||
}
|
||||
|
||||
#
|
||||
# outputs "1" if the fs number has "1" in its quorum/is_leader file.
|
||||
# All other cases output 0, including the fs nr being a client which
|
||||
@@ -144,7 +153,27 @@ t_mount()
|
||||
test "$nr" -lt "$T_NR_MOUNTS" || \
|
||||
t_fail "fs nr $nr invalid"
|
||||
|
||||
eval t_quiet mount -t scoutfs \$T_O$nr \$T_DB$nr \$T_M$nr
|
||||
eval t_quiet mount -t scoutfs \$T_O$nr\$opt \$T_DB$nr \$T_M$nr
|
||||
}
|
||||
|
||||
#
|
||||
# Mount with an optional mount option string. If the string is empty
|
||||
# then the saved mount options are used. If the string has contents
|
||||
# then it is appended to the end of the saved options with a separating
|
||||
# comma.
|
||||
#
|
||||
# Unlike t_mount this won't inherently fail in t_quiet, errors are
|
||||
# returned so bad options can be tested.
|
||||
#
|
||||
t_mount_opt()
|
||||
{
|
||||
local nr="$1"
|
||||
local opt="${2:+,$2}"
|
||||
|
||||
test "$nr" -lt "$T_NR_MOUNTS" || \
|
||||
t_fail "fs nr $nr invalid"
|
||||
|
||||
eval mount -t scoutfs \$T_O$nr\$opt \$T_DB$nr \$T_M$nr
|
||||
}
|
||||
|
||||
t_umount()
|
||||
@@ -391,7 +420,7 @@ t_set_sysfs_mount_option() {
|
||||
local val="$3"
|
||||
local opt="$(t_sysfs_path $nr)/mount_options/$name"
|
||||
|
||||
echo "$val" > "$opt"
|
||||
echo "$val" > "$opt" 2>/dev/null
|
||||
}
|
||||
|
||||
t_set_all_sysfs_mount_options() {
|
||||
|
||||
@@ -49,6 +49,7 @@ four
|
||||
--- can't overwrite non-empty dir
|
||||
mv: cannot move ‘/mnt/test/test/basic-posix-consistency/dir/c/clobber’ to ‘/mnt/test/test/basic-posix-consistency/dir/a/dir’: Directory not empty
|
||||
--- can overwrite empty dir
|
||||
--- can rename into root
|
||||
== path resoluion
|
||||
== inode indexes match after syncing existing
|
||||
== inode indexes match after copying and syncing
|
||||
|
||||
27
tests/golden/change-devices
Normal file
27
tests/golden/change-devices
Normal file
@@ -0,0 +1,27 @@
|
||||
== make tmp sparse data dev files
|
||||
== make scratch fs
|
||||
== small new data device fails
|
||||
rc: 1
|
||||
== check sees data device errors
|
||||
rc: 1
|
||||
rc: 0
|
||||
== preparing while mounted fails
|
||||
rc: 1
|
||||
== preparing without recovery fails
|
||||
rc: 1
|
||||
== check sees metadata errors
|
||||
rc: 1
|
||||
rc: 1
|
||||
== preparing with file data fails
|
||||
rc: 1
|
||||
== preparing after emptied
|
||||
rc: 0
|
||||
== checks pass
|
||||
rc: 0
|
||||
rc: 0
|
||||
== using prepared
|
||||
== preparing larger and resizing
|
||||
rc: 0
|
||||
equal_prepared
|
||||
large_prepared
|
||||
resized larger test rc: 0
|
||||
@@ -24,3 +24,307 @@
|
||||
/mnt/test/test/data-prealloc/file-2: 5 extents found
|
||||
/mnt/test/test/data-prealloc/file-1: 3 extents found
|
||||
/mnt/test/test/data-prealloc/file-2: 3 extents found
|
||||
== block writes into region allocs hole
|
||||
wrote blk 24
|
||||
wrote blk 32
|
||||
wrote blk 40
|
||||
wrote blk 55
|
||||
wrote blk 63
|
||||
wrote blk 71
|
||||
wrote blk 72
|
||||
wrote blk 79
|
||||
wrote blk 80
|
||||
wrote blk 87
|
||||
wrote blk 88
|
||||
wrote blk 95
|
||||
before:
|
||||
24.. 1:
|
||||
32.. 1:
|
||||
40.. 1:
|
||||
55.. 1:
|
||||
63.. 1:
|
||||
71.. 2:
|
||||
79.. 2:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 0 at pos 0
|
||||
wrote blk 0
|
||||
0.. 1:
|
||||
1.. 7: unwritten
|
||||
24.. 1:
|
||||
32.. 1:
|
||||
40.. 1:
|
||||
55.. 1:
|
||||
63.. 1:
|
||||
71.. 2:
|
||||
79.. 2:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 0 at pos 1
|
||||
wrote blk 15
|
||||
0.. 1:
|
||||
1.. 14: unwritten
|
||||
15.. 1:
|
||||
24.. 1:
|
||||
32.. 1:
|
||||
40.. 1:
|
||||
55.. 1:
|
||||
63.. 1:
|
||||
71.. 2:
|
||||
79.. 2:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 0 at pos 2
|
||||
wrote blk 19
|
||||
0.. 1:
|
||||
1.. 14: unwritten
|
||||
15.. 1:
|
||||
16.. 3: unwritten
|
||||
19.. 1:
|
||||
20.. 4: unwritten
|
||||
24.. 1:
|
||||
32.. 1:
|
||||
40.. 1:
|
||||
55.. 1:
|
||||
63.. 1:
|
||||
71.. 2:
|
||||
79.. 2:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 1 at pos 0
|
||||
wrote blk 25
|
||||
0.. 1:
|
||||
1.. 14: unwritten
|
||||
15.. 1:
|
||||
16.. 3: unwritten
|
||||
19.. 1:
|
||||
20.. 4: unwritten
|
||||
24.. 1:
|
||||
25.. 1:
|
||||
26.. 6: unwritten
|
||||
32.. 1:
|
||||
40.. 1:
|
||||
55.. 1:
|
||||
63.. 1:
|
||||
71.. 2:
|
||||
79.. 2:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 1 at pos 1
|
||||
wrote blk 39
|
||||
0.. 1:
|
||||
1.. 14: unwritten
|
||||
15.. 1:
|
||||
16.. 3: unwritten
|
||||
19.. 1:
|
||||
20.. 4: unwritten
|
||||
24.. 1:
|
||||
25.. 1:
|
||||
26.. 6: unwritten
|
||||
32.. 1:
|
||||
39.. 1:
|
||||
40.. 1:
|
||||
55.. 1:
|
||||
63.. 1:
|
||||
71.. 2:
|
||||
79.. 2:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 1 at pos 2
|
||||
wrote blk 44
|
||||
0.. 1:
|
||||
1.. 14: unwritten
|
||||
15.. 1:
|
||||
16.. 3: unwritten
|
||||
19.. 1:
|
||||
20.. 4: unwritten
|
||||
24.. 1:
|
||||
25.. 1:
|
||||
26.. 6: unwritten
|
||||
32.. 1:
|
||||
39.. 1:
|
||||
40.. 1:
|
||||
44.. 1:
|
||||
45.. 3: unwritten
|
||||
55.. 1:
|
||||
63.. 1:
|
||||
71.. 2:
|
||||
79.. 2:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 2 at pos 0
|
||||
wrote blk 48
|
||||
0.. 1:
|
||||
1.. 14: unwritten
|
||||
15.. 1:
|
||||
16.. 3: unwritten
|
||||
19.. 1:
|
||||
20.. 4: unwritten
|
||||
24.. 1:
|
||||
25.. 1:
|
||||
26.. 6: unwritten
|
||||
32.. 1:
|
||||
39.. 1:
|
||||
40.. 1:
|
||||
44.. 1:
|
||||
45.. 3: unwritten
|
||||
48.. 1:
|
||||
49.. 6: unwritten
|
||||
55.. 1:
|
||||
63.. 1:
|
||||
71.. 2:
|
||||
79.. 2:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 2 at pos 1
|
||||
wrote blk 62
|
||||
0.. 1:
|
||||
1.. 14: unwritten
|
||||
15.. 1:
|
||||
16.. 3: unwritten
|
||||
19.. 1:
|
||||
20.. 4: unwritten
|
||||
24.. 1:
|
||||
25.. 1:
|
||||
26.. 6: unwritten
|
||||
32.. 1:
|
||||
39.. 1:
|
||||
40.. 1:
|
||||
44.. 1:
|
||||
45.. 3: unwritten
|
||||
48.. 1:
|
||||
49.. 6: unwritten
|
||||
55.. 1:
|
||||
56.. 6: unwritten
|
||||
62.. 1:
|
||||
63.. 1:
|
||||
71.. 2:
|
||||
79.. 2:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 2 at pos 2
|
||||
wrote blk 67
|
||||
0.. 1:
|
||||
1.. 14: unwritten
|
||||
15.. 1:
|
||||
16.. 3: unwritten
|
||||
19.. 1:
|
||||
20.. 4: unwritten
|
||||
24.. 1:
|
||||
25.. 1:
|
||||
26.. 6: unwritten
|
||||
32.. 1:
|
||||
39.. 1:
|
||||
40.. 1:
|
||||
44.. 1:
|
||||
45.. 3: unwritten
|
||||
48.. 1:
|
||||
49.. 6: unwritten
|
||||
55.. 1:
|
||||
56.. 6: unwritten
|
||||
62.. 1:
|
||||
63.. 1:
|
||||
64.. 3: unwritten
|
||||
67.. 1:
|
||||
68.. 3: unwritten
|
||||
71.. 2:
|
||||
79.. 2:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 3 at pos 0
|
||||
wrote blk 73
|
||||
0.. 1:
|
||||
1.. 14: unwritten
|
||||
15.. 1:
|
||||
16.. 3: unwritten
|
||||
19.. 1:
|
||||
20.. 4: unwritten
|
||||
24.. 1:
|
||||
25.. 1:
|
||||
26.. 6: unwritten
|
||||
32.. 1:
|
||||
39.. 1:
|
||||
40.. 1:
|
||||
44.. 1:
|
||||
45.. 3: unwritten
|
||||
48.. 1:
|
||||
49.. 6: unwritten
|
||||
55.. 1:
|
||||
56.. 6: unwritten
|
||||
62.. 1:
|
||||
63.. 1:
|
||||
64.. 3: unwritten
|
||||
67.. 1:
|
||||
68.. 3: unwritten
|
||||
71.. 2:
|
||||
73.. 1:
|
||||
74.. 5: unwritten
|
||||
79.. 2:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 3 at pos 1
|
||||
wrote blk 86
|
||||
0.. 1:
|
||||
1.. 14: unwritten
|
||||
15.. 1:
|
||||
16.. 3: unwritten
|
||||
19.. 1:
|
||||
20.. 4: unwritten
|
||||
24.. 1:
|
||||
25.. 1:
|
||||
26.. 6: unwritten
|
||||
32.. 1:
|
||||
39.. 1:
|
||||
40.. 1:
|
||||
44.. 1:
|
||||
45.. 3: unwritten
|
||||
48.. 1:
|
||||
49.. 6: unwritten
|
||||
55.. 1:
|
||||
56.. 6: unwritten
|
||||
62.. 1:
|
||||
63.. 1:
|
||||
64.. 3: unwritten
|
||||
67.. 1:
|
||||
68.. 3: unwritten
|
||||
71.. 2:
|
||||
73.. 1:
|
||||
74.. 5: unwritten
|
||||
79.. 2:
|
||||
86.. 1:
|
||||
87.. 2:
|
||||
95.. 1: eof
|
||||
writing into existing 3 at pos 2
|
||||
wrote blk 92
|
||||
0.. 1:
|
||||
1.. 14: unwritten
|
||||
15.. 1:
|
||||
16.. 3: unwritten
|
||||
19.. 1:
|
||||
20.. 4: unwritten
|
||||
24.. 1:
|
||||
25.. 1:
|
||||
26.. 6: unwritten
|
||||
32.. 1:
|
||||
39.. 1:
|
||||
40.. 1:
|
||||
44.. 1:
|
||||
45.. 3: unwritten
|
||||
48.. 1:
|
||||
49.. 6: unwritten
|
||||
55.. 1:
|
||||
56.. 6: unwritten
|
||||
62.. 1:
|
||||
63.. 1:
|
||||
64.. 3: unwritten
|
||||
67.. 1:
|
||||
68.. 3: unwritten
|
||||
71.. 2:
|
||||
73.. 1:
|
||||
74.. 5: unwritten
|
||||
79.. 2:
|
||||
86.. 1:
|
||||
87.. 2:
|
||||
92.. 1:
|
||||
93.. 2: unwritten
|
||||
95.. 1: eof
|
||||
|
||||
18
tests/golden/get-referring-entries
Normal file
18
tests/golden/get-referring-entries
Normal file
@@ -0,0 +1,18 @@
|
||||
== root inode returns nothing
|
||||
== crazy large unused inode does nothing
|
||||
== basic entry
|
||||
file
|
||||
== rename
|
||||
renamed
|
||||
== hard link
|
||||
file
|
||||
link
|
||||
== removal
|
||||
== different dirs
|
||||
== file types
|
||||
type b name block
|
||||
type c name char
|
||||
type d name dir
|
||||
type f name file
|
||||
type l name symlink
|
||||
== all name lengths work
|
||||
@@ -1,3 +1,11 @@
|
||||
== non-acl O_TMPFILE creation honors umask
|
||||
umask 022
|
||||
fstat after open(0777): 0100755
|
||||
stat after linkat: 0100755
|
||||
umask 077
|
||||
fstat after open(0777): 0100700
|
||||
stat after linkat: 0100700
|
||||
== stage from tmpfile
|
||||
total file size 33669120
|
||||
00000000 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 41 |AAAAAAAAAAAAAAAA|
|
||||
*
|
||||
5
tests/golden/quorum-heartbeat-timeout
Normal file
5
tests/golden/quorum-heartbeat-timeout
Normal file
@@ -0,0 +1,5 @@
|
||||
== bad timeout values fail
|
||||
== bad mount option fails
|
||||
== mount option
|
||||
== sysfs
|
||||
== reset all options
|
||||
@@ -5,6 +5,7 @@ inode-items-updated.sh
|
||||
simple-inode-index.sh
|
||||
simple-staging.sh
|
||||
simple-release-extents.sh
|
||||
get-referring-entries.sh
|
||||
fallocate.sh
|
||||
basic-truncate.sh
|
||||
data-prealloc.sh
|
||||
@@ -27,7 +28,7 @@ createmany-large-names.sh
|
||||
createmany-rename-large-dir.sh
|
||||
stage-release-race-alloc.sh
|
||||
stage-multi-part.sh
|
||||
stage-tmpfile.sh
|
||||
o_tmpfile.sh
|
||||
basic-posix-consistency.sh
|
||||
dirent-consistency.sh
|
||||
mkdir-rename-rmdir.sh
|
||||
@@ -36,7 +37,9 @@ cross-mount-data-free.sh
|
||||
persistent-item-vers.sh
|
||||
setup-error-teardown.sh
|
||||
resize-devices.sh
|
||||
change-devices.sh
|
||||
fence-and-reclaim.sh
|
||||
quorum-heartbeat-timeout.sh
|
||||
orphan-inodes.sh
|
||||
mount-unmount-race.sh
|
||||
client-unmount-recovery.sh
|
||||
|
||||
97
tests/src/o_tmpfile_umask.c
Normal file
97
tests/src/o_tmpfile_umask.c
Normal file
@@ -0,0 +1,97 @@
|
||||
/*
|
||||
* Show the modes of files as we create them with O_TMPFILE and link
|
||||
* them into the namespace.
|
||||
*
|
||||
* Copyright (C) 2022 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <sys/stat.h>
|
||||
#include <assert.h>
|
||||
#include <limits.h>
|
||||
|
||||
/*
 * Set the process umask to @mode, create an unnamed file in @dir with
 * O_TMPFILE and a requested mode of 0777, link it into the namespace at
 * @lpath, and print the st_mode observed after the open and again after
 * the link.  The linked file is unlinked before returning so the
 * function can be called repeatedly with the same @lpath.
 *
 * Any failing call is reported with perror() and the process exits with
 * status 1.
 */
static void linkat_tmpfile_modes(char *dir, char *lpath, mode_t mode)
{
	char proc_self[PATH_MAX];
	struct stat st;
	int ret;
	int fd;

	umask(mode);
	printf("umask 0%o\n", mode);

	fd = open(dir, O_RDWR | O_TMPFILE, 0777);
	if (fd < 0) {
		perror("open(O_TMPFILE)");
		exit(1);
	}

	ret = fstat(fd, &st);
	if (ret < 0) {
		perror("fstat");
		exit(1);
	}

	printf("fstat after open(0777): 0%o\n", st.st_mode);

	/* the unnamed file is linked by following its /proc fd symlink */
	snprintf(proc_self, sizeof(proc_self), "/proc/self/fd/%d", fd);

	ret = linkat(AT_FDCWD, proc_self, AT_FDCWD, lpath, AT_SYMLINK_FOLLOW);
	if (ret < 0) {
		perror("linkat");
		exit(1);
	}

	close(fd);

	/* check the mode again through the newly linked path */
	ret = stat(lpath, &st);
	if (ret < 0) {
		/* report the call that actually failed, this is stat not fstat */
		perror("stat");
		exit(1);
	}

	printf("stat after linkat: 0%o\n", st.st_mode);

	ret = unlink(lpath);
	if (ret < 0) {
		perror("unlink");
		exit(1);
	}
}
|
||||
|
||||
/*
 * usage: <prog> <open_dir> <linkat_path>
 *
 * Runs the O_TMPFILE create-and-link mode check twice, first with a
 * umask of 022 and then with 077.  Prints the usage line and returns 1
 * when fewer than two arguments are given.
 */
int main(int argc, char **argv)
{
	if (argc >= 3) {
		linkat_tmpfile_modes(argv[1], argv[2], 022);
		linkat_tmpfile_modes(argv[1], argv[2], 077);
		return 0;
	}

	printf("%s <open_dir> <linkat_path>\n", argv[0]);
	return 1;
}
|
||||
@@ -12,7 +12,7 @@ mount_fail()
|
||||
}
|
||||
|
||||
echo "== prepare devices, mount point, and logs"
|
||||
SCR="/mnt/scoutfs.extra"
|
||||
SCR="$T_TMPDIR/mnt.scratch"
|
||||
mkdir -p "$SCR"
|
||||
> $T_TMP.mount.out
|
||||
scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 \
|
||||
|
||||
@@ -149,6 +149,10 @@ find "$T_D0/dir" -ls 2>&1 | t_filter_fs > "$T_TMP.0"
|
||||
find "$T_D1/dir" -ls 2>&1 | t_filter_fs > "$T_TMP.1"
|
||||
diff -u "$T_TMP.0" "$T_TMP.1"
|
||||
rm -rf "$T_D0/dir"
|
||||
echo "--- can rename into root"
|
||||
touch "$T_D0/rename-into-root"
|
||||
mv "$T_D0/rename-into-root" "$T_M0/"
|
||||
rm -f "$T_M0/rename-into-root"
|
||||
|
||||
echo "== path resoluion"
|
||||
touch "$T_D0/file"
|
||||
|
||||
76
tests/tests/change-devices.sh
Normal file
76
tests/tests/change-devices.sh
Normal file
@@ -0,0 +1,76 @@
|
||||
#
|
||||
# test changing devices
|
||||
#
|
||||
|
||||
echo "== make tmp sparse data dev files"
|
||||
sz=$(blockdev --getsize64 "$T_EX_DATA_DEV")
|
||||
large_sz=$((sz * 2))
|
||||
touch "$T_TMP."{small,equal,large}
|
||||
truncate -s 1MB "$T_TMP.small"
|
||||
truncate -s $sz "$T_TMP.equal"
|
||||
truncate -s $large_sz "$T_TMP.large"
|
||||
|
||||
echo "== make scratch fs"
|
||||
t_quiet scoutfs mkfs -f -Q 0,127.0.0.1,53000 "$T_EX_META_DEV" "$T_EX_DATA_DEV"
|
||||
SCR="$T_TMPDIR/mnt.scratch"
|
||||
mkdir -p "$SCR"
|
||||
|
||||
echo "== small new data device fails"
|
||||
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.small"
|
||||
|
||||
echo "== check sees data device errors"
|
||||
t_rc scoutfs prepare-empty-data-device --check "$T_EX_META_DEV" "$T_TMP.small"
|
||||
t_rc scoutfs prepare-empty-data-device --check "$T_EX_META_DEV"
|
||||
|
||||
echo "== preparing while mounted fails"
|
||||
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
|
||||
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.equal"
|
||||
umount "$SCR"
|
||||
|
||||
echo "== preparing without recovery fails"
|
||||
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
|
||||
umount -f "$SCR"
|
||||
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.equal"
|
||||
|
||||
echo "== check sees metadata errors"
|
||||
t_rc scoutfs prepare-empty-data-device --check "$T_EX_META_DEV"
|
||||
t_rc scoutfs prepare-empty-data-device --check "$T_EX_META_DEV" "$T_TMP.equal"
|
||||
|
||||
echo "== preparing with file data fails"
|
||||
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
|
||||
echo hi > "$SCR"/file
|
||||
umount "$SCR"
|
||||
scoutfs print "$T_EX_META_DEV" > "$T_TMP.print"
|
||||
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.equal"
|
||||
|
||||
echo "== preparing after emptied"
|
||||
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$T_EX_DATA_DEV" "$SCR"
|
||||
rm -f "$SCR"/file
|
||||
umount "$SCR"
|
||||
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.equal"
|
||||
|
||||
echo "== checks pass"
|
||||
t_rc scoutfs prepare-empty-data-device --check "$T_EX_META_DEV"
|
||||
t_rc scoutfs prepare-empty-data-device --check "$T_EX_META_DEV" "$T_TMP.equal"
|
||||
|
||||
echo "== using prepared"
|
||||
scr_loop=$(losetup --find --show "$T_TMP.equal")
|
||||
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$scr_loop" "$SCR"
|
||||
touch "$SCR"/equal_prepared
|
||||
equal_tot=$(scoutfs statfs -s total_data_blocks -p "$SCR")
|
||||
umount "$SCR"
|
||||
losetup -d "$scr_loop"
|
||||
|
||||
echo "== preparing larger and resizing"
|
||||
t_rc scoutfs prepare-empty-data-device "$T_EX_META_DEV" "$T_TMP.large"
|
||||
scr_loop=$(losetup --find --show "$T_TMP.large")
|
||||
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 "$scr_loop" "$SCR"
|
||||
touch "$SCR"/large_prepared
|
||||
ls "$SCR"
|
||||
scoutfs resize-devices -p "$SCR" -d $large_sz
|
||||
large_tot=$(scoutfs statfs -s total_data_blocks -p "$SCR")
|
||||
test "$large_tot" -gt "$equal_tot" ; echo "resized larger test rc: $?"
|
||||
umount "$SCR"
|
||||
losetup -d "$scr_loop"
|
||||
|
||||
t_pass
|
||||
@@ -6,6 +6,15 @@
|
||||
#
|
||||
t_require_commands scoutfs stat filefrag dd touch truncate
|
||||
|
||||
write_block()
|
||||
{
|
||||
local file="$1"
|
||||
local blk="$2"
|
||||
|
||||
dd if=/dev/zero of="$file" bs=4096 seek=$blk count=1 conv=notrunc status=none
|
||||
echo "wrote blk $blk"
|
||||
}
|
||||
|
||||
write_forwards()
|
||||
{
|
||||
local prefix="$1"
|
||||
@@ -70,6 +79,25 @@ print_extents_found()
|
||||
filefrag "$prefix"* 2>&1 | grep "extent.*found" | t_filter_fs
|
||||
}
|
||||
|
||||
#
|
||||
# print the logical start, len, and flags if they're there.
|
||||
#
|
||||
print_logical_extents()
|
||||
{
|
||||
local file="$1"
|
||||
|
||||
filefrag -v -b4096 "$file" 2>&1 | t_filter_fs | awk '
|
||||
($1 ~ /[0-9]+:/) {
|
||||
if ($NF !~ /[0-9]+:/) {
|
||||
flags=$NF
|
||||
} else {
|
||||
flags=""
|
||||
}
|
||||
print $2, $6, flags
|
||||
}
|
||||
'
|
||||
}
|
||||
|
||||
t_save_all_sysfs_mount_options data_prealloc_blocks
|
||||
t_save_all_sysfs_mount_options data_prealloc_contig_only
|
||||
restore_options()
|
||||
@@ -133,4 +161,71 @@ t_set_sysfs_mount_option 0 data_prealloc_contig_only 0
|
||||
write_forwards $prefix 3
|
||||
print_extents_found $prefix
|
||||
|
||||
#
|
||||
# prepare aligned regions of 8 blocks that we'll write into.
|
||||
# We'll write into the first, last, and middle block of each
|
||||
# region which was prepared with no existing extents, one at
|
||||
# the start, and one at the end.
|
||||
#
|
||||
# Let's keep this last because it creates a ton of output to read
|
||||
# through. The correct output is tied to preallocation strategy so it
|
||||
# has to be verified each time we change preallocation.
|
||||
#
|
||||
echo "== block writes into region allocs hole"
|
||||
t_set_sysfs_mount_option 0 data_prealloc_blocks 8
|
||||
t_set_sysfs_mount_option 0 data_prealloc_contig_only 1
|
||||
touch "$prefix"
|
||||
truncate -s 0 "$prefix"
|
||||
|
||||
# write initial blocks in regions
|
||||
base=0
|
||||
for sides in 0 1 2 3; do
|
||||
for i in 0 1 2; do
|
||||
case "$sides" in
|
||||
# none
|
||||
0) ;;
|
||||
# left
|
||||
1) write_block $prefix $((base + 0)) ;;
|
||||
# right
|
||||
2) write_block $prefix $((base + 7)) ;;
|
||||
# both
|
||||
3) write_block $prefix $((base + 0))
|
||||
write_block $prefix $((base + 7)) ;;
|
||||
esac
|
||||
((base+=8))
|
||||
done
|
||||
done
|
||||
|
||||
echo before:
|
||||
print_logical_extents "$prefix"
|
||||
|
||||
# now write into the first, middle, and last empty block of each
|
||||
t_set_sysfs_mount_option 0 data_prealloc_contig_only 0
|
||||
base=0
|
||||
for sides in 0 1 2 3; do
|
||||
for i in 0 1 2; do
|
||||
echo "writing into existing $sides at pos $i"
|
||||
case "$sides" in
|
||||
# none
|
||||
0) left=$base; right=$((base + 7));;
|
||||
# left
|
||||
1) left=$((base + 1)); right=$((base + 7));;
|
||||
# right
|
||||
2) left=$((base)); right=$((base + 6));;
|
||||
# both
|
||||
3) left=$((base + 1)); right=$((base + 6));;
|
||||
esac
|
||||
case "$i" in
|
||||
# start
|
||||
0) write_block $prefix $left ;;
|
||||
# end
|
||||
1) write_block $prefix $right ;;
|
||||
# mid (both has 6 blocks internally)
|
||||
2) write_block $prefix $((left + 3)) ;;
|
||||
esac
|
||||
print_logical_extents "$prefix"
|
||||
((base+=8))
|
||||
done
|
||||
done
|
||||
|
||||
t_pass
|
||||
|
||||
@@ -59,7 +59,7 @@ echo "== make small meta fs"
|
||||
# meta device just big enough for reserves and the metadata we'll fill
|
||||
scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m 10G "$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
|
||||
t_fail "mkfs failed"
|
||||
SCR="/mnt/scoutfs.enospc"
|
||||
SCR="$T_TMPDIR/mnt.scratch"
|
||||
mkdir -p "$SCR"
|
||||
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
|
||||
"$T_EX_DATA_DEV" "$SCR"
|
||||
|
||||
99
tests/tests/get-referring-entries.sh
Normal file
99
tests/tests/get-referring-entries.sh
Normal file
@@ -0,0 +1,99 @@
|
||||
|
||||
#
|
||||
# Test _GET_REFERRING_ENTRIES ioctl via the get-referring-entries cli
|
||||
# command
|
||||
#
|
||||
|
||||
# consistently print only entry names
|
||||
filter_names() {
|
||||
exec cut -d ' ' -f 8- | sort
|
||||
}
|
||||
|
||||
# print entries with type characters to match find. not happy with hard
|
||||
# coding, but abi won't change much.
|
||||
filter_types() {
|
||||
exec cut -d ' ' -f 5- | \
|
||||
sed \
|
||||
-e 's/type 1 /type p /' \
|
||||
-e 's/type 2 /type c /' \
|
||||
-e 's/type 4 /type d /' \
|
||||
-e 's/type 6 /type b /' \
|
||||
-e 's/type 8 /type f /' \
|
||||
-e 's/type 10 /type l /' \
|
||||
-e 's/type 12 /type s /' \
|
||||
| \
|
||||
sort
|
||||
}
|
||||
|
||||
n_chars() {
|
||||
local n="$1"
|
||||
printf 'A%.0s' $(eval echo {1..\$n})
|
||||
}
|
||||
|
||||
GRE="scoutfs get-referring-entries -p $T_M0"
|
||||
|
||||
echo "== root inode returns nothing"
|
||||
$GRE 1
|
||||
|
||||
echo "== crazy large unused inode does nothing"
|
||||
$GRE 4611686018427387904 # 1 << 62
|
||||
|
||||
echo "== basic entry"
|
||||
touch $T_D0/file
|
||||
ino=$(stat -c '%i' $T_D0/file)
|
||||
$GRE $ino | filter_names
|
||||
|
||||
echo "== rename"
|
||||
mv $T_D0/file $T_D0/renamed
|
||||
$GRE $ino | filter_names
|
||||
|
||||
echo "== hard link"
|
||||
mv $T_D0/renamed $T_D0/file
|
||||
ln $T_D0/file $T_D0/link
|
||||
$GRE $ino | filter_names
|
||||
|
||||
echo "== removal"
|
||||
rm $T_D0/file $T_D0/link
|
||||
$GRE $ino
|
||||
|
||||
echo "== different dirs"
|
||||
touch $T_D0/file
|
||||
ino=$(stat -c '%i' $T_D0/file)
|
||||
for i in $(seq 1 10); do
|
||||
mkdir $T_D0/dir-$i
|
||||
ln $T_D0/file $T_D0/dir-$i/file-$i
|
||||
done
|
||||
diff -u <(find $T_D0 -type f -printf '%f\n' | sort) <($GRE $ino | filter_names)
|
||||
rm $T_D0/file
|
||||
|
||||
echo "== file types"
|
||||
mkdir $T_D0/dir
|
||||
touch $T_D0/dir/file
|
||||
mkdir $T_D0/dir/dir
|
||||
ln -s $T_D0/dir/file $T_D0/dir/symlink
|
||||
mknod $T_D0/dir/char c 1 3 # null
|
||||
mknod $T_D0/dir/block b 7 0 # loop0
|
||||
for name in $(ls -UA $T_D0/dir | sort); do
|
||||
ino=$(stat -c '%i' $T_D0/dir/$name)
|
||||
$GRE $ino | filter_types
|
||||
done
|
||||
rm -rf $T_D0/dir
|
||||
|
||||
echo "== all name lengths work"
|
||||
mkdir $T_D0/dir
|
||||
touch $T_D0/dir/file
|
||||
ino=$(stat -c '%i' $T_D0/dir/file)
|
||||
name=""
|
||||
> $T_TMP.unsorted
|
||||
for i in $(seq 1 255); do
|
||||
name+="a"
|
||||
echo "$name" >> $T_TMP.unsorted
|
||||
ln $T_D0/dir/file $T_D0/dir/$name
|
||||
done
|
||||
sort $T_TMP.unsorted > $T_TMP.sorted
|
||||
rm $T_D0/dir/file
|
||||
$GRE $ino | filter_names > $T_TMP.gre
|
||||
diff -u $T_TMP.sorted $T_TMP.gre
|
||||
rm -rf $T_D0/dir
|
||||
|
||||
t_pass
|
||||
16
tests/tests/o_tmpfile.sh
Normal file
16
tests/tests/o_tmpfile.sh
Normal file
@@ -0,0 +1,16 @@
|
||||
#
# basic tests of O_TMPFILE
#

# every helper binary this script runs must be listed, including
# o_tmpfile_umask which the first test invokes
t_require_commands o_tmpfile_umask stage_tmpfile hexdump

echo "== non-acl O_TMPFILE creation honors umask"
o_tmpfile_umask "$T_D0" "$T_D0/umask-file"

echo "== stage from tmpfile"
DEST_FILE="$T_D0/dest_file"
# quote the paths so a test dir containing whitespace isn't split
stage_tmpfile "$T_D0" "$DEST_FILE"
hexdump -C "$DEST_FILE"
rm -f "$DEST_FILE"

t_pass
|
||||
117
tests/tests/quorum-heartbeat-timeout.sh
Normal file
117
tests/tests/quorum-heartbeat-timeout.sh
Normal file
@@ -0,0 +1,117 @@
|
||||
#
|
||||
# test that the quorum_heartbeat_timeout_ms option affects how long it
|
||||
# takes to recover from a failed mount.
|
||||
#
|
||||
|
||||
t_require_mounts 2
|
||||
|
||||
time_ms()
|
||||
{
|
||||
# time_t in seconds, then truncate nanoseconds to the 3 most significant digits
|
||||
date +%s%3N
|
||||
}
|
||||
|
||||
set_bad_timeout() {
|
||||
local to="$1"
|
||||
t_set_sysfs_mount_option 0 quorum_heartbeat_timeout_ms $to && \
|
||||
t_fail "set bad q hb to $to"
|
||||
}
|
||||
|
||||
set_timeout()
|
||||
{
|
||||
local nr="$1"
|
||||
local how="$2"
|
||||
local to="$3"
|
||||
local is
|
||||
|
||||
if [ $how == "sysfs" ]; then
|
||||
t_set_sysfs_mount_option $nr quorum_heartbeat_timeout_ms $to
|
||||
fi
|
||||
if [ $how == "mount" ]; then
|
||||
t_umount $nr
|
||||
t_mount_opt $nr "quorum_heartbeat_timeout_ms=$to"
|
||||
fi
|
||||
|
||||
is=$(t_get_sysfs_mount_option $nr quorum_heartbeat_timeout_ms)
|
||||
|
||||
if [ "$is" != "$to" ]; then
|
||||
t_fail "tried to set qhbto on $nr via $how to $to but got $is"
|
||||
fi
|
||||
}
|
||||
|
||||
test_timeout()
|
||||
{
|
||||
local how="$1"
|
||||
local to="$2"
|
||||
local start
|
||||
local nr
|
||||
local sv
|
||||
local delay
|
||||
local low
|
||||
local high
|
||||
|
||||
# set timeout on non-server quorum mounts
|
||||
sv=$(t_server_nr)
|
||||
for nr in $(t_quorum_nrs); do
|
||||
if [ $nr -ne $sv ]; then
|
||||
set_timeout $nr $how $to
|
||||
fi
|
||||
done
|
||||
|
||||
# give followers time to recv heartbeats and reset timeouts
|
||||
sleep 1
|
||||
|
||||
# tear down the current server/leader
|
||||
t_force_umount $sv
|
||||
|
||||
# see how long it takes for the next leader to start
|
||||
start=$(time_ms)
|
||||
t_wait_for_leader
|
||||
delay=$(($(time_ms) - start))
|
||||
|
||||
# kind of fun to have these logged
|
||||
echo "to $to delay $delay" >> $T_TMP.delay
|
||||
|
||||
# restore the mount that we tore down
|
||||
t_mount $sv
|
||||
|
||||
# make sure the new leader delay was reasonable, allowing for some slack
|
||||
low=$((to - 1000))
|
||||
high=$((to + 5000))
|
||||
|
||||
# make sure the new leader delay was reasonable
|
||||
test "$delay" -lt "$low" && t_fail "delay $delay < low $low (to $to)"
|
||||
test "$delay" -gt "$high" && t_fail "delay $delay > high $high (to $to)"
|
||||
}
|
||||
|
||||
echo "== bad timeout values fail"
|
||||
set_bad_timeout 0
|
||||
set_bad_timeout -1
|
||||
set_bad_timeout 1000000
|
||||
|
||||
echo "== bad mount option fails"
|
||||
if [ "$(t_server_nr)" == 0 ]; then
|
||||
nr=1
|
||||
else
|
||||
nr=0
|
||||
fi
|
||||
t_umount $nr
|
||||
t_mount_opt $nr "quorum_heartbeat_timeout_ms=1000000" 2>/dev/null && \
|
||||
t_fail "bad mount option succeeded"
|
||||
t_mount $nr
|
||||
|
||||
echo "== mount option"
|
||||
def=$(t_get_sysfs_mount_option 0 quorum_heartbeat_timeout_ms)
|
||||
test_timeout mount $def
|
||||
test_timeout mount 3000
|
||||
test_timeout mount $((def + 19000))
|
||||
|
||||
echo "== sysfs"
|
||||
test_timeout sysfs $def
|
||||
test_timeout sysfs 3000
|
||||
test_timeout sysfs $((def + 19000))
|
||||
|
||||
echo "== reset all options"
|
||||
t_remount_all
|
||||
|
||||
t_pass
|
||||
@@ -73,7 +73,7 @@ echo "== make initial small fs"
|
||||
scoutfs mkfs -A -f -Q 0,127.0.0.1,53000 -m $quarter_meta -d $quarter_data \
|
||||
"$T_EX_META_DEV" "$T_EX_DATA_DEV" > $T_TMP.mkfs.out 2>&1 || \
|
||||
t_fail "mkfs failed"
|
||||
SCR="/mnt/scoutfs.enospc"
|
||||
SCR="$T_TMPDIR/mnt.scratch"
|
||||
mkdir -p "$SCR"
|
||||
mount -t scoutfs -o metadev_path=$T_EX_META_DEV,quorum_slot_nr=0 \
|
||||
"$T_EX_DATA_DEV" "$SCR"
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
#
|
||||
# Run tmpfile_stage and check the output with hexdump.
|
||||
#
|
||||
|
||||
t_require_commands stage_tmpfile hexdump
|
||||
|
||||
DEST_FILE="$T_D0/dest_file"
|
||||
|
||||
stage_tmpfile $T_D0 $DEST_FILE
|
||||
|
||||
hexdump -C "$DEST_FILE"
|
||||
|
||||
rm -fr "$DEST_FILE"
|
||||
|
||||
t_pass
|
||||
@@ -85,6 +85,25 @@ the options directory in the mount's sysfs directory. Writing a new
|
||||
value will cause the next pending orphan scan to be rescheduled
|
||||
with the newly written delay time.
|
||||
.TP
|
||||
.B quorum_heartbeat_timeout_ms=<number>
|
||||
This option sets the amount of time, in milliseconds, that a quorum
|
||||
member will wait without receiving heartbeat messages from the current
|
||||
leader before trying to take over as leader. This setting is per-mount
|
||||
and only changes the behavior of that mount.
|
||||
.sp
|
||||
This determines how long it may take before a failed leader is replaced
|
||||
by a waiting quorum member. Setting it too low may lead to spurious
|
||||
fencing as active leaders are prematurely replaced due to task or
|
||||
network delays that prevent the quorum members from promptly sending and
|
||||
receiving messages. The ideal setting is the longest acceptable
|
||||
downtime during server failover. The default is 10000 (10s) and it can
|
||||
not be less than 2000 or greater than 60000.
|
||||
.sp
|
||||
This option can be changed in an active mount by writing to its file in
|
||||
the options directory in the mount's sysfs directory. Writing a new
|
||||
value will take effect the next time the quorum agent receives a
|
||||
heartbeat message and sets the next timeout.
|
||||
.TP
|
||||
.B quorum_slot_nr=<number>
|
||||
The quorum_slot_nr option assigns a quorum member slot to the mount.
|
||||
The mount will use the slot assignment to claim exclusive ownership of
|
||||
|
||||
@@ -76,6 +76,97 @@ run when the file system will not be mounted.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "counters [-t|--table] SYSFS-DIR"
|
||||
.sp
|
||||
Display the counters and their values for a mounted ScoutFS filesystem.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.B SYSFS-DIR
|
||||
The mount's sysfs directory in which to find the
|
||||
.B counters/
|
||||
directory which then contains files for each counter.
|
||||
The sysfs directory is
|
||||
of the form
|
||||
.I /sys/fs/scoutfs/f.<fsid>.r.<rid>/
|
||||
\&.
|
||||
.TP
|
||||
.B "-t, --table"
|
||||
Format the counters into a columnar table that fills the width of the display
|
||||
instead of printing one counter per line.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "data-waiting {-I|--inode} INODE-NUM {-B|--block} BLOCK-NUM [-p|--path PATH]"
|
||||
.sp
|
||||
Display all the files and blocks for which there is a task blocked waiting on
|
||||
offline data.
|
||||
.sp
|
||||
The results are sorted by the file's inode number and the
|
||||
logical block offset that is being waited on.
|
||||
.sp
|
||||
Each line of output describes a block in a file that has a task waiting
|
||||
and is formatted as:
|
||||
.I "ino <nr> iblock <nr> ops [str]"
|
||||
\&. The ops string indicates blocked operations separated by commas and can
|
||||
include
|
||||
.B read
|
||||
for a read operation,
|
||||
.B write
|
||||
for a write operation, and
|
||||
.B change_size
|
||||
for a truncate or extending write.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.B "-I, --inode INODE-NUM"
|
||||
Start iterating over waiting tasks from the given inode number.
|
||||
Value of 0 will show all waiting tasks.
|
||||
.TP
|
||||
.B "-B, --block BLOCK-NUM"
|
||||
Start iterating over waiting tasks from the given logical block number
|
||||
in the starting inode. Value of 0 will show blocks in the first inode
|
||||
and then continue to show all blocks with tasks waiting in all the
|
||||
remaining inodes.
|
||||
.TP
|
||||
.B "-p, --path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "data-wait-err {-I|--inode} INODE-NUM {-V|--version} VER-NUM {-F|--offset} OFF-NUM {-C|--count} COUNT {-O|--op} OP {-E|--err} ERR [-p|--path PATH]"
|
||||
.sp
|
||||
Return error from matching waiters.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.B "-C, --count COUNT"
|
||||
Count.
|
||||
.TP
|
||||
.B "-E, --err ERR"
|
||||
Error.
|
||||
.TP
|
||||
.B "-F, --offset OFF-NUM"
|
||||
Offset. May be expressed in bytes, or with KMGTP (Kibi, Mebi, etc.) size
|
||||
suffixes.
|
||||
.TP
|
||||
.B "-I, --inode INODE-NUM"
|
||||
Inode number.
|
||||
.TP
|
||||
.B "-O, --op OP"
|
||||
Operation. One of: "read", "write", "change_size".
|
||||
.TP
|
||||
.B "-p, --path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "df [-h|--human-readable] [-p|--path PATH]"
|
||||
.sp
|
||||
@@ -93,6 +184,95 @@ A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "get-allocated-inos [-i|--ino INO] [-s|--single] [-p|--path PATH]"
|
||||
.sp
|
||||
This debugging command prints allocated inode numbers. It only prints
|
||||
inodes
|
||||
found in the group that contains the starting inode. The printed inode
|
||||
numbers aren't necessarily reachable. They could be anywhere in the
|
||||
process from being unlinked to finally deleted when their items
|
||||
were found.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-i, --ino INO"
|
||||
The first 64bit inode number which could be printed.
|
||||
.TP
|
||||
.B "-s, --single"
|
||||
Only print the single starting inode when it is allocated, all other allocated
|
||||
inode numbers will be ignored.
|
||||
.TP
|
||||
.B "-p, --path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "get-referring-entries [-p|--path PATH] INO"
|
||||
.sp
|
||||
Find directory entries that reference an inode number.
|
||||
.sp
|
||||
Display all the directory entries that refer to a given inode. Each
|
||||
entry includes the inode number of the directory that contains it, the
|
||||
d_off and d_type values for the entry as described by
|
||||
.BR readdir (3)
|
||||
, and the name of the entry.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.TP
|
||||
.B "-p, --path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.TP
|
||||
.B "INO"
|
||||
The inode number of the target inode.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "ino-path INODE-NUM [-p|--path PATH]"
|
||||
.sp
|
||||
Display all paths that reference an inode number.
|
||||
.sp
|
||||
Ongoing filesystem changes, such as renaming a common parent of multiple paths,
|
||||
can cause displayed paths to be inconsistent.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.B "INODE-NUM"
|
||||
The inode number of the target inode.
|
||||
.TP
|
||||
.B "-p|--path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "list-hidden-xattrs FILE"
|
||||
.sp
|
||||
Display extended attributes starting with the
|
||||
.BR scoutfs.
|
||||
prefix and containing the
|
||||
.BR hide.
|
||||
tag
|
||||
which makes them invisible to
|
||||
.BR listxattr (2) .
|
||||
The names of each attribute are output, one per line. Their order
|
||||
is not specified.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "FILE"
|
||||
The path to a file within a ScoutFS filesystem. File permissions must allow
|
||||
reading.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "mkfs META-DEVICE DATA-DEVICE {-Q|--quorum-slot} NR,ADDR,PORT [-m|--max-meta-size SIZE] [-d|--max-data-size SIZE] [-z|--data-alloc-zone-blocks BLOCKS] [-f|--force] [-A|--allow-small-size] [-V|--format-version VERS]"
|
||||
.sp
|
||||
@@ -171,6 +351,79 @@ The range of supported versions is visible in the output of
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "prepare-empty-data-device {-c|--check} META-DEVICE DATA-DEVICE"
|
||||
.sp
|
||||
Prepare an unused device for use as the data device for an existing file
|
||||
system. This will write an initialized super block to the specified
|
||||
data device, destroying any existing contents. The specified metadata
|
||||
device will not be modified. The file system must be fully unmounted
|
||||
and any client mount recovery must be complete.
|
||||
.sp
|
||||
The existing metadata device is read to ensure that it's safe to stop
|
||||
using the old data device. The data block allocators must indicate that
|
||||
all data blocks are free. If there are still data blocks referenced by
|
||||
files then the command will fail. The contents of these files must be
|
||||
freed for the command to proceed.
|
||||
.sp
|
||||
A new super block is written to the new data device. The device can
|
||||
then be used as the data device to mount the file system. As this
|
||||
switch is made all client mounts must refer to the new device. The old
|
||||
device is not modified and still contains a valid data super block that
|
||||
could be mounted, creating data device writes that wouldn't be read by
|
||||
mounts using the new device.
|
||||
.sp
|
||||
The number of data blocks available to the file system will not change
|
||||
as the new data device is used. The new device must be large enough to
|
||||
store all the data blocks that were available on the old device. If the
|
||||
new device is larger then its added capacity can be used by growing the
|
||||
new data device with the resize-devices command once it is mounted.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-c, --check"
|
||||
Only check for errors that would prevent a new empty data device from
|
||||
being used. No changes will be made to the data device. If the data
|
||||
device is provided then its size will be checked to make sure that it is
|
||||
large enough. This can be used to test the metadata for data references
|
||||
before destroying an old empty data device.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "print {-S|--skip-likely-huge} META-DEVICE"
|
||||
.sp
|
||||
Prints out all of the metadata in the file system. This makes no effort
|
||||
to ensure that the structures are consistent as they're traversed and
|
||||
can present structures that seem corrupt as they change as they're
|
||||
output.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-S, --skip-likely-huge"
|
||||
Skip printing structures that are likely to be very large. The
|
||||
structures that are skipped tend to be global and whose size tends to be
|
||||
related to the size of the volume. Examples of skipped structures include
|
||||
the global fs items, srch files, and metadata and data
|
||||
allocators. Similar structures that are not skipped are related to the
|
||||
number of mounts and are maintained at a relatively reasonable size.
|
||||
These include per-mount log trees, srch files, allocators, and the
|
||||
metadata allocators used by server commits.
|
||||
.sp
|
||||
Skipping the larger structures limits the print output to a relatively
|
||||
constant size rather than being a large multiple of the used metadata
|
||||
space of the volume making the output much more useful for inspection.
|
||||
.TP
|
||||
.B "META-DEVICE"
|
||||
The path to the metadata device for the filesystem whose metadata will be
|
||||
printed. An attempt will be made to flush the host's buffer cache for
|
||||
this device with the BLKFLSBUF ioctl, or with posix_fadvise() if
|
||||
the path refers to a regular file.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "resize-devices [-p|--path PATH] [-m|--meta-size SIZE] [-d|--data-size SIZE]"
|
||||
.sp
|
||||
@@ -229,6 +482,92 @@ kibibytes, mebibytes, etc.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "search-xattrs XATTR-NAME [-p|--path PATH]"
|
||||
.sp
|
||||
Display the inode numbers of inodes in the filesystem which may have
|
||||
an extended attribute with the given name.
|
||||
.sp
|
||||
The results may contain false positives. The returned inode numbers
|
||||
should be checked to verify that the extended attribute is in fact
|
||||
present on the inode.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B XATTR-NAME
|
||||
The full name of the extended attribute to search for as
|
||||
described in the
|
||||
.BR xattr (7)
|
||||
manual page.
|
||||
.TP
|
||||
.B "-p|--path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "setattr FILE [-d, --data-version=VERSION [-s, --size=SIZE [-o, --offline]]] [-t, --ctime=TIMESPEC]"
|
||||
.sp
|
||||
Set ScoutFS-specific attributes on a newly created zero-length file.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.B "-V, --data-version=VERSION"
|
||||
Set data version.
|
||||
.TP
|
||||
.B "-o, --offline"
|
||||
Set file contents as offline, not sparse. Requires
|
||||
.I --size
|
||||
option also be present.
|
||||
.TP
|
||||
.B "-s, --size=SIZE"
|
||||
Set file size. May be expressed in bytes, or with
|
||||
KMGTP (Kibi, Mebi, etc.) size suffixes. Requires
|
||||
.I --data-version
|
||||
option also be present.
|
||||
.TP
|
||||
.B "-t, --ctime=TIMESPEC"
|
||||
Set creation time using
|
||||
.I "<seconds-since-epoch>.<nanoseconds>"
|
||||
format.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "stage ARCHIVE-FILE FILE {-V|--version} VERSION [-o, --offset OFF-NUM] [-l, --length LENGTH]"
|
||||
.sp
|
||||
.B Stage
|
||||
(i.e. return to online) the previously-offline contents of a file by copying a
|
||||
region from another file, the archive, and without updating regular inode
|
||||
metadata. Any operations that are blocked by the existence of an offline
|
||||
region will proceed once the region has been staged.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "ARCHIVE-FILE"
|
||||
The source file for the file contents being staged.
|
||||
.TP
|
||||
.B "FILE"
|
||||
The regular file whose contents will be staged.
|
||||
.TP
|
||||
.B "-V, --version VERSION"
|
||||
The data_version of the contents to be staged. It must match the
|
||||
current data_version of the file.
|
||||
.TP
|
||||
.B "-o, --offset OFF-NUM"
|
||||
The starting byte offset of the region to write. May be expressed in bytes, or with
|
||||
KMGTP (Kibi, Mebi, etc.) size suffixes. Default is 0.
|
||||
.TP
|
||||
.B "-l, --length LENGTH"
|
||||
Length of range (bytes or KMGTP units) of file to stage. Default is the file's
|
||||
total size.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "stat FILE [-s|--single-field FIELD-NAME]"
|
||||
.sp
|
||||
Display ScoutFS-specific metadata fields for the given file.
|
||||
@@ -314,221 +653,6 @@ The total number of 4K data blocks in the filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "counters [-t|--table] SYSFS-DIR"
|
||||
.sp
|
||||
Display the counters and their values for a mounted ScoutFS filesystem.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.B SYSFS-DIR
|
||||
The mount's sysfs directory in which to find the
|
||||
.B counters/
|
||||
directory which then contains files for each counter.
|
||||
The sysfs directory is
|
||||
of the form
|
||||
.I /sys/fs/scoutfs/f.<fsid>.r.<rid>/
|
||||
\&.
|
||||
.TP
|
||||
.B "-t, --table"
|
||||
Format the counters into a columnar table that fills the width of the display
|
||||
instead of printing one counter per line.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "search-xattrs XATTR-NAME [-p|--path PATH]"
|
||||
.sp
|
||||
Display the inode numbers of inodes in the filesystem which may have
|
||||
an extended attribute with the given name.
|
||||
.sp
|
||||
The results may contain false positives. The returned inode numbers
|
||||
should be checked to verify that the extended attribute is in fact
|
||||
present on the inode.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B XATTR-NAME
|
||||
The full name of the extended attribute to search for as
|
||||
described in the
|
||||
.BR xattr (7)
|
||||
manual page.
|
||||
.TP
|
||||
.B "-p|--path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "list-hidden-xattrs FILE"
|
||||
.sp
|
||||
Display extended attributes starting with the
|
||||
.BR scoutfs.
|
||||
prefix and containing the
|
||||
.BR hide.
|
||||
tag
|
||||
which makes them invisible to
|
||||
.BR listxattr (2) .
|
||||
The names of each attribute are output, one per line. Their order
|
||||
is not specified.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "FILE"
|
||||
The path to a file within a ScoutFS filesystem. File permissions must allow
|
||||
reading.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "walk-inodes {meta_seq|data_seq} FIRST-INODE LAST-INODE [-p|--path PATH]"
|
||||
.sp
|
||||
Walk an inode index in the file system and output the inode numbers
|
||||
that are found between the first and last positions in the index.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.BR meta_seq , data_seq
|
||||
Which index to walk.
|
||||
.TP
|
||||
.B "FIRST-INODE"
|
||||
An integer index value giving starting position of the index walk.
|
||||
.I 0
|
||||
is the first possible position.
|
||||
.TP
|
||||
.B "LAST-INODE"
|
||||
An integer index value giving the last position to include in the index walk.
|
||||
.I \-1
|
||||
can be given to indicate the last possible position.
|
||||
.TP
|
||||
.B "-p|--path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "ino-path INODE-NUM [-p|--path PATH]"
|
||||
.sp
|
||||
Display all paths that reference an inode number.
|
||||
.sp
|
||||
Ongoing filesystem changes, such as renaming a common parent of multiple paths,
|
||||
can cause displayed paths to be inconsistent.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.B "INODE-NUM"
|
||||
The inode number of the target inode.
|
||||
.TP
|
||||
.B "-p|--path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "data-waiting {-I|--inode} INODE-NUM {-B|--block} BLOCK-NUM [-p|--path PATH]"
|
||||
.sp
|
||||
Display all the files and blocks for which there is a task blocked waiting on
|
||||
offline data.
|
||||
.sp
|
||||
The results are sorted by the file's inode number and the
|
||||
logical block offset that is being waited on.
|
||||
.sp
|
||||
Each line of output describes a block in a file that has a task waiting
|
||||
and is formatted as:
|
||||
.I "ino <nr> iblock <nr> ops [str]"
|
||||
\&. The ops string indicates blocked operations separated by commas and can
|
||||
include
|
||||
.B read
|
||||
for a read operation,
|
||||
.B write
|
||||
for a write operation, and
|
||||
.B change_size
|
||||
for a truncate or extending write.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.B "-I, --inode INODE-NUM"
|
||||
Start iterating over waiting tasks from the given inode number.
|
||||
Value of 0 will show all waiting tasks.
|
||||
.TP
|
||||
.B "-B, --block BLOCK-NUM"
|
||||
Start iterating over waiting tasks from the given logical block number
|
||||
in the starting inode. Value of 0 will show blocks in the first inode
|
||||
and then continue to show all blocks with tasks waiting in all the
|
||||
remaining inodes.
|
||||
.TP
|
||||
.B "-p, --path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "data-wait-err {-I|--inode} INODE-NUM {-V|--version} VER-NUM {-F|--offset} OFF-NUM {-C|--count} COUNT {-O|--op} OP {-E|--err} ERR [-p|--path PATH]"
|
||||
.sp
|
||||
Return error from matching waiters.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.B "-C, --count COUNT"
|
||||
Count.
|
||||
.TP
|
||||
.B "-E, --err ERR"
|
||||
Error.
|
||||
.TP
|
||||
.B "-F, --offset OFF-NUM"
|
||||
Offset. May be expressed in bytes, or with KMGTP (Kibi, Mebi, etc.) size
|
||||
suffixes.
|
||||
.TP
|
||||
.B "-I, --inode INODE-NUM"
|
||||
Inode number.
|
||||
.TP
|
||||
.B "-O, --op OP"
|
||||
Operation. One of: "read", "write", "change_size".
|
||||
.TP
|
||||
.B "-p, --path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "stage ARCHIVE-FILE FILE {-V|--version} VERSION [-o, --offset OFF-NUM] [-l, --length LENGTH]"
|
||||
.sp
|
||||
.B Stage
|
||||
(i.e. return to online) the previously-offline contents of a file by copying a
|
||||
region from another file, the archive, and without updating regular inode
|
||||
metadata. Any operations that are blocked by the existence of an offline
|
||||
region will proceed once the region has been staged.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "ARCHIVE-FILE"
|
||||
The source file for the file contents being staged.
|
||||
.TP
|
||||
.B "FILE"
|
||||
The regular file whose contents will be staged.
|
||||
.TP
|
||||
.B "-V, --version VERSION"
|
||||
The data_version of the contents to be staged. It must match the
|
||||
current data_version of the file.
|
||||
.TP
|
||||
.B "-o, --offset OFF-NUM"
|
||||
The starting byte offset of the region to write. May be expressed in bytes, or with
|
||||
KMGTP (Kibi, Mebi, etc.) size suffixes. Default is 0.
|
||||
.TP
|
||||
.B "-l, --length LENGTH"
|
||||
Length of range (bytes or KMGTP units) of file to stage. Default is the file's
|
||||
total size.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "release FILE {-V|--version} VERSION [-o, --offset OFF-NUM] [-l, --length LENGTH]"
|
||||
.sp
|
||||
@@ -568,90 +692,28 @@ total size.
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "setattr FILE [-d, --data-version=VERSION [-s, --size=SIZE [-o, --offline]]] [-t, --ctime=TIMESPEC]"
|
||||
.BI "walk-inodes {meta_seq|data_seq} FIRST-INODE LAST-INODE [-p|--path PATH]"
|
||||
.sp
|
||||
Set ScoutFS-specific attributes on a newly created zero-length file.
|
||||
Walk an inode index in the file system and output the inode numbers
|
||||
that are found between the first and last positions in the index.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.sp
|
||||
.TP
|
||||
.B "-V, --data-version=VERSION"
|
||||
Set data version.
|
||||
.BR meta_seq , data_seq
|
||||
Which index to walk.
|
||||
.TP
|
||||
.B "-o, --offline"
|
||||
Set file contents as offline, not sparse. Requires
|
||||
.I --size
|
||||
option also be present.
|
||||
.B "FIRST-INODE"
|
||||
An integer index value giving starting position of the index walk.
|
||||
.I 0
|
||||
is the first possible position.
|
||||
.TP
|
||||
.B "-s, --size=SIZE"
|
||||
Set file size. May be expressed in bytes, or with
|
||||
KMGTP (Kibi, Mebi, etc.) size suffixes. Requires
|
||||
.I --data-version
|
||||
option also be present.
|
||||
.B "LAST-INODE"
|
||||
An integer index value giving the last position to include in the index walk.
|
||||
.I \-1
|
||||
can be given to indicate the last possible position.
|
||||
.TP
|
||||
.B "-t, --ctime=TIMESPEC"
|
||||
Set creation time using
|
||||
.I "<seconds-since-epoch>.<nanoseconds>"
|
||||
format.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "print {-S|--skip-likely-huge} META-DEVICE"
|
||||
.sp
|
||||
Prints out all of the metadata in the file system. This makes no effort
|
||||
to ensure that the structures are consistent as they're traversed and
|
||||
can present structures that seem corrupt as they change as they're
|
||||
output.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-S, --skip-likely-huge"
|
||||
Skip printing structures that are likely to be very large. The
|
||||
structures that are skipped tend to be global and whose size tends to be
|
||||
related to the size of the volume. Examples of skipped structures include
|
||||
the global fs items, srch files, and metadata and data
|
||||
allocators. Similar structures that are not skipped are related to the
|
||||
number of mounts and are maintained at a relatively reasonable size.
|
||||
These include per-mount log trees, srch files, allocators, and the
|
||||
metadata allocators used by server commits.
|
||||
.sp
|
||||
Skipping the larger structures limits the print output to a relatively
|
||||
constant size rather than being a large multiple of the used metadata
|
||||
space of the volume making the output much more useful for inspection.
|
||||
.TP
|
||||
.B "META-DEVICE"
|
||||
The path to the metadata device for the filesystem whose metadata will be
|
||||
printed. Since this command reads via the host's buffer cache, it may not
|
||||
reflect the current blocks in the filesystem possibly written to the shared
|
||||
block devices from another host, unless
|
||||
.B blockdev \--flushbufs
|
||||
command is used first.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "get-allocated-inos [-i|--ino INO] [-s|--single] [-p|--path PATH]"
|
||||
.sp
|
||||
This debugging command prints allocated inode numbers. It only prints
|
||||
inodes
|
||||
found in the group that contains the starting inode. The printed inode
|
||||
numbers aren't necessarily reachable. They could be anywhere in the
|
||||
process from being unlinked to finally deleted when their items
|
||||
were found.
|
||||
.RS 1.0i
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-i, --ino INO"
|
||||
The first 64bit inode number which could be printed.
|
||||
.TP
|
||||
.B "-s, --single"
|
||||
Only print the single starting inode when it is allocated, all other allocated
|
||||
inode numbers will be ignored.
|
||||
.TP
|
||||
.B "-p, --path PATH"
|
||||
.B "-p|--path PATH"
|
||||
A path within a ScoutFS filesystem.
|
||||
.RE
|
||||
.PD
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <linux/fs.h>
|
||||
#include <errno.h>
|
||||
@@ -11,13 +12,10 @@
|
||||
#include "sparse.h"
|
||||
#include "dev.h"
|
||||
|
||||
int device_size(char *path, int fd,
|
||||
u64 min_size, u64 max_size, bool allow_small_size,
|
||||
char *use_type, u64 *size_ret)
|
||||
int get_device_size(char *path, int fd, u64 *size_ret)
|
||||
{
|
||||
struct stat st;
|
||||
u64 size;
|
||||
char *target_type;
|
||||
int ret;
|
||||
|
||||
if (fstat(fd, &st)) {
|
||||
@@ -29,7 +27,6 @@ int device_size(char *path, int fd,
|
||||
|
||||
if (S_ISREG(st.st_mode)) {
|
||||
size = st.st_size;
|
||||
target_type = "file";
|
||||
} else if (S_ISBLK(st.st_mode)) {
|
||||
if (ioctl(fd, BLKGETSIZE64, &size)) {
|
||||
ret = -errno;
|
||||
@@ -37,13 +34,26 @@ int device_size(char *path, int fd,
|
||||
path, strerror(errno), errno);
|
||||
return ret;
|
||||
}
|
||||
target_type = "device";
|
||||
} else {
|
||||
fprintf(stderr, "path isn't regular or device file '%s'\n",
|
||||
path);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
*size_ret = size;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int limit_device_size(char *path, int fd, u64 min_size, u64 max_size, bool allow_small_size,
|
||||
char *use_type, u64 *size_ret)
|
||||
{
|
||||
u64 size;
|
||||
int ret;
|
||||
|
||||
ret = get_device_size(path, fd, &size);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (max_size) {
|
||||
if (size > max_size) {
|
||||
printf("Limiting use of "BASE_SIZE_FMT
|
||||
@@ -63,9 +73,9 @@ int device_size(char *path, int fd,
|
||||
|
||||
if (size < min_size) {
|
||||
fprintf(stderr,
|
||||
BASE_SIZE_FMT" %s too small for min "
|
||||
BASE_SIZE_FMT" too small for min "
|
||||
BASE_SIZE_FMT" %s device%s\n",
|
||||
BASE_SIZE_ARGS(size), target_type,
|
||||
BASE_SIZE_ARGS(size),
|
||||
BASE_SIZE_ARGS(min_size), use_type,
|
||||
allow_small_size ? ", allowing with -A" : "");
|
||||
|
||||
@@ -103,3 +113,44 @@ char *size_str(u64 nr, unsigned size)
|
||||
|
||||
return suffixes[i];
|
||||
}
|
||||
|
||||
/*
 * Try to flush the local read cache for a device.  This is only a best
 * effort as these interfaces don't block waiting to fully purge the
 * cache.  This is OK because it's used by cached readers that are known
 * to be racy anyway.
 *
 * Regular files are handled with posix_fadvise(POSIX_FADV_DONTNEED) over
 * the file's current size and block devices with the BLKFLSBUF ioctl.
 * Any other file type is left untouched and treated as success.
 *
 * Returns 0 on success or a negative errno value on failure, after
 * printing a message to stderr.
 */
int flush_device(int fd)
{
	struct stat st;
	int ret;

	ret = fstat(fd, &st);
	if (ret < 0) {
		ret = -errno;
		fprintf(stderr, "fstat failed: %s (%d)\n", strerror(-ret), -ret);
		goto out;
	}

	if (S_ISREG(st.st_mode)) {
		/*
		 * Unlike most calls, posix_fadvise() returns the (positive)
		 * error number directly and leaves errno alone, so the
		 * result has to be tested against zero and used as the
		 * error itself.
		 */
		ret = posix_fadvise(fd, 0, st.st_size, POSIX_FADV_DONTNEED);
		if (ret != 0) {
			fprintf(stderr, "POSIX_FADV_DONTNEED failed: %s (%d)\n",
				strerror(ret), ret);
			ret = -ret;
			goto out;
		}

	} else if (S_ISBLK(st.st_mode)) {
		ret = ioctl(fd, BLKFLSBUF, 0);
		if (ret < 0) {
			ret = -errno;
			fprintf(stderr, "BLKFLSBUF, failed: %s (%d)\n", strerror(-ret), -ret);
			goto out;
		}
	}

	ret = 0;
out:
	return ret;
}
|
||||
|
||||
@@ -9,10 +9,11 @@
|
||||
#define SIZE_FMT "%llu (%.2f %s)"
|
||||
#define SIZE_ARGS(nr, sz) (nr), size_flt(nr, sz), size_str(nr, sz)
|
||||
|
||||
int device_size(char *path, int fd,
|
||||
u64 min_size, u64 max_size, bool allow_small_size,
|
||||
char *use_type, u64 *size_ret);
|
||||
int get_device_size(char *path, int fd, u64 *size_ret);
|
||||
int limit_device_size(char *path, int fd, u64 min_size, u64 max_size, bool allow_small_size,
|
||||
char *use_type, u64 *size_ret);
|
||||
float size_flt(u64 nr, unsigned size);
|
||||
char *size_str(u64 nr, unsigned size);
|
||||
int flush_device(int fd);
|
||||
|
||||
#endif
|
||||
|
||||
150
utils/src/get_referring_entries.c
Normal file
150
utils/src/get_referring_entries.c
Normal file
@@ -0,0 +1,150 @@
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include <argp.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "parse.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "ioctl.h"
|
||||
#include "parse.h"
|
||||
#include "cmd.h"
|
||||
|
||||
struct gre_args {
|
||||
char *path;
|
||||
u64 ino;
|
||||
};
|
||||
|
||||
static int do_get_referring_entries(struct gre_args *args)
|
||||
{
|
||||
struct scoutfs_ioctl_get_referring_entries gre;
|
||||
struct scoutfs_ioctl_dirent *dent;
|
||||
unsigned int bytes;
|
||||
void *buf;
|
||||
int ret;
|
||||
int fd;
|
||||
|
||||
fd = get_path(args->path, O_RDONLY);
|
||||
if (fd < 0)
|
||||
return fd;
|
||||
|
||||
bytes = PATH_MAX * 1024;
|
||||
buf = malloc(bytes);
|
||||
if (!buf) {
|
||||
fprintf(stderr, "couldn't allocate %u byte buffer\n", bytes);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
gre.ino = args->ino;
|
||||
gre.dir_ino = 0;
|
||||
gre.dir_pos = 0;
|
||||
gre.entries_ptr = (intptr_t)buf;
|
||||
gre.entries_bytes = bytes;
|
||||
|
||||
for (;;) {
|
||||
ret = ioctl(fd, SCOUTFS_IOC_GET_REFERRING_ENTRIES, &gre);
|
||||
if (ret <= 0) {
|
||||
if (ret < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "ioctl failed: %s (%d)\n", strerror(errno), errno);
|
||||
}
|
||||
goto out;
|
||||
}
|
||||
|
||||
dent = buf;
|
||||
while (ret-- > 0) {
|
||||
printf("dir %llu pos %llu type %u name %s\n",
|
||||
dent->dir_ino, dent->dir_pos, dent->d_type, dent->name);
|
||||
|
||||
gre.dir_ino = dent->dir_ino;
|
||||
gre.dir_pos = dent->dir_pos;
|
||||
|
||||
if (dent->flags & SCOUTFS_IOCTL_DIRENT_FLAG_LAST) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
dent = (void *)dent + dent->entry_bytes;
|
||||
}
|
||||
|
||||
if (++gre.dir_pos == 0) {
|
||||
if (++gre.dir_ino == 0) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
close(fd);
|
||||
free(buf);
|
||||
|
||||
return ret;
|
||||
};
|
||||
|
||||
static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct gre_args *args = state->input;
|
||||
int ret;
|
||||
|
||||
switch (key) {
|
||||
case 'p':
|
||||
args->path = strdup_or_error(state, arg);
|
||||
break;
|
||||
case ARGP_KEY_ARG:
|
||||
if (args->ino)
|
||||
argp_error(state, "more than one argument given");
|
||||
ret = parse_u64(arg, &args->ino);
|
||||
if (ret)
|
||||
argp_error(state, "inode parse error");
|
||||
break;
|
||||
case ARGP_KEY_FINI:
|
||||
if (!args->ino) {
|
||||
argp_error(state, "must provide inode number");
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct argp_option options[] = {
|
||||
{ "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"},
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"INODE-NUM",
|
||||
"Print directory entries that refer to inode number"
|
||||
};
|
||||
|
||||
static int get_referring_entries_cmd(int argc, char **argv)
|
||||
{
|
||||
struct gre_args args = {NULL};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&argp, argc, argv, 0, NULL, &args);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return do_get_referring_entries(&args);
|
||||
}
|
||||
|
||||
|
||||
static void __attribute__((constructor)) get_referring_entries_ctor(void)
|
||||
{
|
||||
cmd_register_argp("get-referring-entries", &argp, GROUP_SEARCH, get_referring_entries_cmd);
|
||||
}
|
||||
@@ -118,6 +118,33 @@ struct mkfs_args {
|
||||
struct scoutfs_quorum_slot slots[SCOUTFS_QUORUM_MAX_SLOTS];
|
||||
};
|
||||
|
||||
static int open_mkfs_dev(struct mkfs_args *args, char *path, mode_t mode, char *which)
|
||||
{
|
||||
int ret;
|
||||
int fd = -1;
|
||||
|
||||
fd = open(path, mode);
|
||||
if (fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to open %s dev '%s': %s (%d)\n",
|
||||
which, path, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = flush_device(fd);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
if (!args->force)
|
||||
ret = check_bdev(fd, path, which);
|
||||
|
||||
out:
|
||||
if (ret < 0 && fd >= 0)
|
||||
close(fd);
|
||||
|
||||
return ret ?: fd;
|
||||
}
|
||||
|
||||
/*
|
||||
* Make a new file system by writing:
|
||||
* - super blocks
|
||||
@@ -156,32 +183,17 @@ static int do_mkfs(struct mkfs_args *args)
|
||||
gettimeofday(&tv, NULL);
|
||||
pseudo_random_bytes(&fsid, sizeof(fsid));
|
||||
|
||||
meta_fd = open(args->meta_device, O_RDWR | O_EXCL);
|
||||
meta_fd = open_mkfs_dev(args, args->meta_device, O_RDWR | O_EXCL, "meta");
|
||||
if (meta_fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to open '%s': %s (%d)\n",
|
||||
args->meta_device, strerror(errno), errno);
|
||||
ret = meta_fd;
|
||||
goto out;
|
||||
}
|
||||
if (!args->force) {
|
||||
ret = check_bdev(meta_fd, args->meta_device, "meta");
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
data_fd = open(args->data_device, O_RDWR | O_EXCL);
|
||||
data_fd = open_mkfs_dev(args, args->data_device, O_RDWR | O_EXCL, "data");
|
||||
if (data_fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to open '%s': %s (%d)\n",
|
||||
args->data_device, strerror(errno), errno);
|
||||
ret = data_fd;
|
||||
goto out;
|
||||
}
|
||||
if (!args->force) {
|
||||
ret = check_bdev(data_fd, args->data_device, "data");
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
super = calloc(1, SCOUTFS_BLOCK_SM_SIZE);
|
||||
bt = calloc(1, SCOUTFS_BLOCK_LG_SIZE);
|
||||
@@ -194,14 +206,14 @@ static int do_mkfs(struct mkfs_args *args)
|
||||
}
|
||||
|
||||
/* minimum meta device size to make reserved blocks reasonably large */
|
||||
ret = device_size(args->meta_device, meta_fd, 64ULL * (1024 * 1024 * 1024),
|
||||
args->max_meta_size, args->allow_small_size, "meta", &meta_size);
|
||||
ret = limit_device_size(args->meta_device, meta_fd, 64ULL * (1024 * 1024 * 1024),
|
||||
args->max_meta_size, args->allow_small_size, "meta", &meta_size);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
/* .. then arbitrarily the same minimum data device size */
|
||||
ret = device_size(args->data_device, data_fd, 64ULL * (1024 * 1024 * 1024),
|
||||
args->max_data_size, args->allow_small_size, "data", &data_size);
|
||||
ret = limit_device_size(args->data_device, data_fd, 64ULL * (1024 * 1024 * 1024),
|
||||
args->max_data_size, args->allow_small_size, "data", &data_size);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
|
||||
247
utils/src/prepare_empty_data_device.c
Normal file
247
utils/src/prepare_empty_data_device.c
Normal file
@@ -0,0 +1,247 @@
|
||||
#define _GNU_SOURCE /* O_DIRECT */
|
||||
#include <unistd.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <sys/time.h>
|
||||
#include <uuid/uuid.h>
|
||||
#include <fcntl.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
#include <sys/socket.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <ctype.h>
|
||||
#include <inttypes.h>
|
||||
#include <argp.h>
|
||||
|
||||
#include "sparse.h"
|
||||
#include "cmd.h"
|
||||
#include "util.h"
|
||||
#include "format.h"
|
||||
#include "parse.h"
|
||||
#include "crc.h"
|
||||
#include "rand.h"
|
||||
#include "dev.h"
|
||||
#include "key.h"
|
||||
#include "bitops.h"
|
||||
#include "btree.h"
|
||||
#include "leaf_item_hash.h"
|
||||
#include "blkid.h"
|
||||
#include "quorum.h"
|
||||
|
||||
/* Parsed command line state for the prepare-empty-data-device command. */
struct prepare_empty_data_dev_args {
	/* first positional argument: path of the metadata device */
	char *meta_device;
	/* second positional argument: path of the new data device; may be NULL when only checking */
	char *data_device;
	/* set by -c/--check: validate only, never write to the data device */
	bool check;
};
|
||||
|
||||
static int do_prepare_empty_data_dev(struct prepare_empty_data_dev_args *args)
|
||||
{
|
||||
struct scoutfs_super_block *meta_super = NULL;
|
||||
struct scoutfs_super_block *data_super = NULL;
|
||||
char uuid_str[37];
|
||||
int meta_fd = -1;
|
||||
int data_fd = -1;
|
||||
u64 data_blocks;
|
||||
u64 data_size;
|
||||
u64 in_use;
|
||||
int ret;
|
||||
|
||||
ret = posix_memalign((void **)&data_super, SCOUTFS_BLOCK_SM_SIZE, SCOUTFS_BLOCK_SM_SIZE);
|
||||
if (ret < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to allocate data super block: %s (%d)\n",
|
||||
strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
meta_fd = open(args->meta_device, O_DIRECT | O_SYNC | O_RDONLY | O_EXCL);
|
||||
if (meta_fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to open meta device '%s': %s (%d)\n",
|
||||
args->meta_device, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = read_block_verify(meta_fd, SCOUTFS_BLOCK_MAGIC_SUPER, 0, SCOUTFS_SUPER_BLKNO,
|
||||
SCOUTFS_BLOCK_SM_SHIFT, (void **)&meta_super);
|
||||
if (ret) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to read meta super block: %s (%d)\n",
|
||||
strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = meta_super_in_use(meta_fd, meta_super);
|
||||
if (ret < 0) {
|
||||
if (ret == -EBUSY)
|
||||
fprintf(stderr, "The filesystem must be fully recovered and cleanly unmounted to determine if the data device is empty.\n");
|
||||
goto out;
|
||||
}
|
||||
|
||||
in_use = (le64_to_cpu(meta_super->total_data_blocks) - SCOUTFS_DATA_DEV_START_BLKNO) -
|
||||
le64_to_cpu(meta_super->data_alloc.total_len);
|
||||
if (in_use) {
|
||||
fprintf(stderr, "Data block allocator metadata shows "SIZE_FMT" data blocks used by files. They must be removed, truncated, or released before a new empty data device can be used.\n",
|
||||
SIZE_ARGS(in_use, SCOUTFS_BLOCK_SM_SIZE));
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (args->data_device) {
|
||||
data_fd = open(args->data_device, O_DIRECT | O_EXCL |
|
||||
(args->check ? O_RDONLY : O_RDWR | O_SYNC));
|
||||
if (data_fd < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "failed to open data device '%s': %s (%d)\n",
|
||||
args->data_device, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = get_device_size(args->data_device, data_fd, &data_size);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
data_blocks = data_size >> SCOUTFS_BLOCK_SM_SHIFT;
|
||||
|
||||
if (data_blocks < le64_to_cpu(meta_super->total_data_blocks)) {
|
||||
fprintf(stderr, "new data device %s of size "BASE_SIZE_FMT" has %llu 4KiB blocks, it needs at least "SIZE_FMT" blocks.\n",
|
||||
args->data_device,
|
||||
BASE_SIZE_ARGS(data_size),
|
||||
data_blocks,
|
||||
SIZE_ARGS(le64_to_cpu(meta_super->total_data_blocks),
|
||||
SCOUTFS_BLOCK_SM_SIZE));
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
if (args->check) {
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* the data device superblock only needs fs identifying fields */
|
||||
memset(data_super, 0, sizeof(struct scoutfs_super_block));
|
||||
data_super->id = meta_super->id;
|
||||
data_super->fmt_vers = meta_super->fmt_vers;
|
||||
data_super->flags = meta_super->flags &~ cpu_to_le64(SCOUTFS_FLAG_IS_META_BDEV);
|
||||
memcpy(data_super->uuid, meta_super->uuid,sizeof(data_super->uuid));
|
||||
data_super->seq = meta_super->seq;
|
||||
data_super->total_meta_blocks = meta_super->total_meta_blocks;
|
||||
data_super->total_data_blocks = meta_super->total_data_blocks;
|
||||
|
||||
ret = write_block(data_fd, SCOUTFS_BLOCK_MAGIC_SUPER, meta_super->hdr.fsid, 1,
|
||||
SCOUTFS_SUPER_BLKNO, SCOUTFS_BLOCK_SM_SHIFT, &data_super->hdr);
|
||||
if (ret < 0) {
|
||||
ret = -errno;
|
||||
fprintf(stderr, "Error writing super block to new data device '%s': %s (%d)\n",
|
||||
args->data_device, strerror(errno), errno);
|
||||
goto out;
|
||||
}
|
||||
|
||||
uuid_unparse(meta_super->uuid, uuid_str);
|
||||
|
||||
printf("Successfully initialized empty data device for scoutfs filesystem:\n"
|
||||
" meta device path: %s\n"
|
||||
" data device path: %s\n"
|
||||
" fsid: %llx\n"
|
||||
" uuid: %s\n"
|
||||
" format version: %llu\n"
|
||||
" 64KB metadata blocks: "SIZE_FMT"\n"
|
||||
" 4KB data blocks: "SIZE_FMT"\n",
|
||||
args->meta_device,
|
||||
args->data_device,
|
||||
le64_to_cpu(meta_super->hdr.fsid),
|
||||
uuid_str,
|
||||
le64_to_cpu(meta_super->fmt_vers),
|
||||
SIZE_ARGS(le64_to_cpu(meta_super->total_meta_blocks),
|
||||
SCOUTFS_BLOCK_LG_SIZE),
|
||||
SIZE_ARGS(le64_to_cpu(meta_super->total_data_blocks),
|
||||
SCOUTFS_BLOCK_SM_SIZE));
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
if (args->check) {
|
||||
if (ret == 0)
|
||||
printf("All checks passed.\n");
|
||||
else
|
||||
printf("Errors were found that must be addressed before a new empty data device could be prepared and used.\n");
|
||||
}
|
||||
|
||||
if (meta_super)
|
||||
free(meta_super);
|
||||
if (data_super)
|
||||
free(data_super);
|
||||
if (meta_fd != -1)
|
||||
close(meta_fd);
|
||||
if (data_fd != -1)
|
||||
close(data_fd);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
{
|
||||
struct prepare_empty_data_dev_args *args = state->input;
|
||||
|
||||
switch (key) {
|
||||
case 'c':
|
||||
args->check = true;
|
||||
break;
|
||||
case ARGP_KEY_ARG:
|
||||
if (!args->meta_device)
|
||||
args->meta_device = strdup_or_error(state, arg);
|
||||
else if (!args->data_device)
|
||||
args->data_device = strdup_or_error(state, arg);
|
||||
else
|
||||
argp_error(state, "more than two device arguments given");
|
||||
break;
|
||||
case ARGP_KEY_FINI:
|
||||
if (!args->meta_device)
|
||||
argp_error(state, "no metadata device argument given");
|
||||
if (!args->data_device && !args->check)
|
||||
argp_error(state, "no data device argument given");
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Command line options understood by prepare-empty-data-device. */
static struct argp_option options[] = {
	{
		.name = "check",
		.key = 'c',
		.arg = NULL,
		.flags = 0,
		.doc = "Only check for errors and do not write",
	},
	{ .name = NULL }	/* terminator */
};
|
||||
|
||||
static struct argp argp = {
|
||||
options,
|
||||
parse_opt,
|
||||
"META-DEVICE DATA-DEVICE",
|
||||
"Prepare empty data device for use with an existing ScoutFS filesystem"
|
||||
};
|
||||
|
||||
static int prepare_empty_data_dev_cmd(int argc, char *argv[])
|
||||
{
|
||||
struct prepare_empty_data_dev_args prepare_empty_data_dev_args = {
|
||||
.check = false,
|
||||
};
|
||||
int ret;
|
||||
|
||||
ret = argp_parse(&argp, argc, argv, 0, NULL, &prepare_empty_data_dev_args);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
return do_prepare_empty_data_dev(&prepare_empty_data_dev_args);
|
||||
}
|
||||
|
||||
static void __attribute__((constructor)) prepare_empty_data_dev_ctor(void)
|
||||
{
|
||||
cmd_register_argp("prepare-empty-data-device", &argp, GROUP_CORE,
|
||||
prepare_empty_data_dev_cmd);
|
||||
}
|
||||
@@ -27,6 +27,7 @@
|
||||
#include "avl.h"
|
||||
#include "srch.h"
|
||||
#include "leaf_item_hash.h"
|
||||
#include "dev.h"
|
||||
|
||||
static void print_block_header(struct scoutfs_block_header *hdr, int size)
|
||||
{
|
||||
@@ -1107,7 +1108,12 @@ static int do_print(struct print_args *args)
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = flush_device(fd);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
ret = print_volume(fd, args);
|
||||
out:
|
||||
close(fd);
|
||||
return ret;
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user