mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-09 13:23:14 +00:00
Compare commits
22 Commits
zab/cluste
...
zab/v1.7
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f13aba78b1 | ||
|
|
3220c2055c | ||
|
|
1cbc927ccb | ||
|
|
acb94dd9b7 | ||
|
|
233fbb39f3 | ||
|
|
198d3cda32 | ||
|
|
e8c64b4217 | ||
|
|
89b64ae1f7 | ||
|
|
fc8a5a1b5c | ||
|
|
d4c793e010 | ||
|
|
8a3058818c | ||
|
|
ba9a106f72 | ||
|
|
310725eb72 | ||
|
|
51a8236316 | ||
|
|
f3dd00895b | ||
|
|
49df98f5a8 | ||
|
|
15cf3c4134 | ||
|
|
1abe97351d | ||
|
|
f757e29915 | ||
|
|
31e474c5fa | ||
|
|
dcf8202d7c | ||
|
|
ae55fa3153 |
@@ -1,6 +1,64 @@
|
||||
Versity ScoutFS Release Notes
|
||||
=============================
|
||||
|
||||
---
|
||||
v1.7
|
||||
\
|
||||
*Aug 26, 2022*
|
||||
|
||||
* **Fixed possible persistent errors moving freed data extents**
|
||||
\
|
||||
Fixed a case where the server could hit persistent errors trying to
|
||||
move a client's freed extents in one commit. The client had to free
|
||||
a large number of extents that occupied distant positions in the
|
||||
global free extent btree. Very large fragmented files could cause
|
||||
this. The server now moves the freed extents in multiple commits and
|
||||
can always ensure forward progress.
|
||||
|
||||
* **Fixed possible persistent errors from freed duplicate extents**
|
||||
\
|
||||
Background orphan deletion wasn't properly synchronizing with
|
||||
foreground tasks deleting very large files. If a deletion took long
|
||||
enough then background deletion could also attempt to delete inode items
|
||||
while the deletion was making progress. This could create duplicate
|
||||
deletions of data extent items which causes the server to abort when
|
||||
it later discovers the duplicate extents as it merges free lists.
|
||||
|
||||
---
|
||||
v1.6
|
||||
\
|
||||
*Jul 7, 2022*
|
||||
|
||||
* **Fix memory leaks in rare corner cases**
|
||||
\
|
||||
Analysis tools found a few corner cases that leaked small structures,
|
||||
generally around error handling or startup and shutdown.
|
||||
|
||||
* **Add --skip-likely-huge scoutfs print command option**
|
||||
\
|
||||
Add an option to scoutfs print to reduce the size of the output
|
||||
so that it can be used to see system-wide metadata without being
|
||||
overwhelmed by file-level details.
|
||||
|
||||
---
|
||||
v1.5
|
||||
\
|
||||
*Jun 21, 2022*
|
||||
|
||||
* **Fix persistent error during server startup**
|
||||
\
|
||||
Fixed a case where the server would always hit a consistent error on
|
||||
startup, preventing the system from mounting. This required a rare
|
||||
but valid state across the clients.
|
||||
|
||||
* **Fix a client hang that would lead to fencing**
|
||||
\
|
||||
The client module's use of in-kernel networking was missing annotation
|
||||
that could lead to communication hanging. The server would fence the
|
||||
client when it stopped communicating. This could be identified by the
|
||||
server fencing a client after it disconnected with no attempt by the
|
||||
client to reconnect.
|
||||
|
||||
---
|
||||
v1.4
|
||||
\
|
||||
|
||||
@@ -892,12 +892,11 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r
|
||||
* -ENOENT is returned if we run out of extents in the source tree
|
||||
* before moving the total.
|
||||
*
|
||||
* If meta_reserved is non-zero then -EINPROGRESS can be returned if the
|
||||
* current meta allocator's avail blocks or room for freed blocks would
|
||||
* have fallen under the reserved amount. The could have been
|
||||
* successfully dirtied in this case but the number of blocks moved is
|
||||
* not returned. The caller is expected to deal with the partial
|
||||
* progress by commiting the dirty trees and examining the resulting
|
||||
* If meta_budget is non-zero then -EINPROGRESS can be returned if the
|
||||
* caller's budget is consumed in the allocator during this call
|
||||
* (though not necessarily by us, we don't have per-thread tracking of
|
||||
* allocator consumption :/). The call can still have made progress and
|
||||
* the caller is expected to commit the dirty trees and examine the resulting
|
||||
* modified trees to see if they need to continue moving extents.
|
||||
*
|
||||
* The caller can specify that extents in the source tree should first
|
||||
@@ -914,7 +913,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 total,
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved)
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget)
|
||||
{
|
||||
struct alloc_ext_args args = {
|
||||
.alloc = alloc,
|
||||
@@ -922,6 +921,8 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
};
|
||||
struct scoutfs_extent found;
|
||||
struct scoutfs_extent ext;
|
||||
u32 avail_start = 0;
|
||||
u32 freed_start = 0;
|
||||
u64 moved = 0;
|
||||
u64 count;
|
||||
int ret = 0;
|
||||
@@ -932,6 +933,9 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
vacant = NULL;
|
||||
}
|
||||
|
||||
if (meta_budget != 0)
|
||||
scoutfs_alloc_meta_remaining(alloc, &avail_start, &freed_start);
|
||||
|
||||
while (moved < total) {
|
||||
count = total - moved;
|
||||
|
||||
@@ -964,10 +968,10 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
if (meta_reserved != 0 &&
|
||||
scoutfs_alloc_meta_low(sb, alloc, meta_reserved +
|
||||
extent_mod_blocks(src->root.height) +
|
||||
extent_mod_blocks(dst->root.height))) {
|
||||
if (meta_budget != 0 &&
|
||||
scoutfs_alloc_meta_low_since(alloc, avail_start, freed_start, meta_budget,
|
||||
extent_mod_blocks(src->root.height) +
|
||||
extent_mod_blocks(dst->root.height))) {
|
||||
ret = -EINPROGRESS;
|
||||
break;
|
||||
}
|
||||
@@ -1351,6 +1355,27 @@ void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total,
|
||||
} while (read_seqretry(&alloc->seqlock, seq));
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if the caller's consumption of nr from either avail or
|
||||
* freed would end up exceeding their budget relative to the starting
|
||||
* remaining snapshot they took.
|
||||
*/
|
||||
bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
|
||||
u32 budget, u32 nr)
|
||||
{
|
||||
u32 avail_use;
|
||||
u32 freed_use;
|
||||
u32 avail;
|
||||
u32 freed;
|
||||
|
||||
scoutfs_alloc_meta_remaining(alloc, &avail, &freed);
|
||||
|
||||
avail_use = avail_start - avail;
|
||||
freed_use = freed_start - freed;
|
||||
|
||||
return ((avail_use + nr) > budget) || ((freed_use + nr) > budget);
|
||||
}
|
||||
|
||||
bool scoutfs_alloc_test_flag(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 flag)
|
||||
{
|
||||
|
||||
@@ -131,7 +131,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 total,
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved);
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget);
|
||||
int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
|
||||
u64 start, u64 len);
|
||||
@@ -159,6 +159,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb,
|
||||
bool scoutfs_alloc_meta_low(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 nr);
|
||||
void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space);
|
||||
bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
|
||||
u32 budget, u32 nr);
|
||||
bool scoutfs_alloc_test_flag(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 flag);
|
||||
|
||||
|
||||
@@ -1685,6 +1685,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct scoutfs_inode sinode;
|
||||
struct scoutfs_key key;
|
||||
bool clear_trying = false;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
int ret;
|
||||
@@ -1704,6 +1705,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
clear_trying = true;
|
||||
|
||||
/* can't delete if it's cached in local or remote mounts */
|
||||
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
|
||||
@@ -1730,7 +1732,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
|
||||
|
||||
ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
|
||||
out:
|
||||
if (ldata)
|
||||
if (clear_trying)
|
||||
clear_bit(bit_nr, ldata->trying);
|
||||
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
|
||||
@@ -355,6 +355,7 @@ static int submit_send(struct super_block *sb,
|
||||
}
|
||||
if (rid != 0) {
|
||||
spin_unlock(&conn->lock);
|
||||
kfree(msend);
|
||||
return -ENOTCONN;
|
||||
}
|
||||
}
|
||||
@@ -991,6 +992,8 @@ static void scoutfs_net_listen_worker(struct work_struct *work)
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
acc_sock->sk->sk_allocation = GFP_NOFS;
|
||||
|
||||
/* inherit accepted request funcs from listening conn */
|
||||
acc_conn = scoutfs_net_alloc_conn(sb, conn->notify_up,
|
||||
conn->notify_down,
|
||||
@@ -1053,6 +1056,8 @@ static void scoutfs_net_connect_worker(struct work_struct *work)
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
sock->sk->sk_allocation = GFP_NOFS;
|
||||
|
||||
/* caller specified connect timeout */
|
||||
tv.tv_sec = conn->connect_timeout_ms / MSEC_PER_SEC;
|
||||
tv.tv_usec = (conn->connect_timeout_ms % MSEC_PER_SEC) * USEC_PER_MSEC;
|
||||
@@ -1341,10 +1346,12 @@ scoutfs_net_alloc_conn(struct super_block *sb,
|
||||
if (!conn)
|
||||
return NULL;
|
||||
|
||||
conn->info = kzalloc(info_size, GFP_NOFS);
|
||||
if (!conn->info) {
|
||||
kfree(conn);
|
||||
return NULL;
|
||||
if (info_size) {
|
||||
conn->info = kzalloc(info_size, GFP_NOFS);
|
||||
if (!conn->info) {
|
||||
kfree(conn);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
conn->workq = alloc_workqueue("scoutfs_net_%s",
|
||||
@@ -1450,6 +1457,8 @@ int scoutfs_net_bind(struct super_block *sb,
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
sock->sk->sk_allocation = GFP_NOFS;
|
||||
|
||||
optval = 1;
|
||||
ret = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
|
||||
(char *)&optval, sizeof(optval));
|
||||
|
||||
@@ -157,6 +157,15 @@ static int free_rid(struct omap_rid_list *list, struct omap_rid_entry *entry)
|
||||
return nr;
|
||||
}
|
||||
|
||||
static void free_rid_list(struct omap_rid_list *list)
|
||||
{
|
||||
struct omap_rid_entry *entry;
|
||||
struct omap_rid_entry *tmp;
|
||||
|
||||
list_for_each_entry_safe(entry, tmp, &list->head, head)
|
||||
free_rid(list, entry);
|
||||
}
|
||||
|
||||
static int copy_rids(struct omap_rid_list *to, struct omap_rid_list *from, spinlock_t *from_lock)
|
||||
{
|
||||
struct omap_rid_entry *entry;
|
||||
@@ -804,6 +813,10 @@ void scoutfs_omap_server_shutdown(struct super_block *sb)
|
||||
llist_for_each_entry_safe(req, tmp, requests, llnode)
|
||||
kfree(req);
|
||||
|
||||
spin_lock(&ominf->lock);
|
||||
free_rid_list(&ominf->rids);
|
||||
spin_unlock(&ominf->lock);
|
||||
|
||||
synchronize_rcu();
|
||||
}
|
||||
|
||||
@@ -864,6 +877,10 @@ void scoutfs_omap_destroy(struct super_block *sb)
|
||||
rhashtable_walk_stop(&iter);
|
||||
rhashtable_walk_exit(&iter);
|
||||
|
||||
spin_lock(&ominf->lock);
|
||||
free_rid_list(&ominf->rids);
|
||||
spin_unlock(&ominf->lock);
|
||||
|
||||
rhashtable_destroy(&ominf->group_ht);
|
||||
rhashtable_destroy(&ominf->req_ht);
|
||||
kfree(ominf);
|
||||
|
||||
@@ -694,13 +694,13 @@ static int alloc_move_refill_zoned(struct super_block *sb, struct scoutfs_alloc_
|
||||
|
||||
static int alloc_move_empty(struct super_block *sb,
|
||||
struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 meta_reserved)
|
||||
struct scoutfs_alloc_root *src, u64 meta_budget)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
|
||||
return scoutfs_alloc_move(sb, &server->alloc, &server->wri,
|
||||
dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0,
|
||||
meta_reserved);
|
||||
meta_budget);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1226,6 +1226,82 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* The calling get_log_trees ran out of available blocks in its commit's
|
||||
* metadata allocator while moving extents from the log tree's
|
||||
* data_freed into the core data_avail. This finishes moving the
|
||||
* extents in as many additional commits as it takes. The logs mutex
|
||||
* is nested inside holding commits so we recheck the persistent item
|
||||
* each time we commit to make sure it's still what we think. The
|
||||
* caller is still going to send the item to the client so we update the
|
||||
* caller's each time we make progress. This is a best-effort attempt
|
||||
* to clean up and it's valid to leave extents in data_freed we don't
|
||||
* return errors to the caller. The client will continue the work later
|
||||
* in get_log_trees or as the rid is reclaimed.
|
||||
*/
|
||||
static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
const u64 rid = le64_to_cpu(lt->rid);
|
||||
const u64 nr = le64_to_cpu(lt->nr);
|
||||
struct scoutfs_log_trees drain;
|
||||
struct scoutfs_key key;
|
||||
COMMIT_HOLD(hold);
|
||||
int ret = 0;
|
||||
int err;
|
||||
|
||||
scoutfs_key_init_log_trees(&key, rid, nr);
|
||||
|
||||
while (lt->data_freed.total_len != 0) {
|
||||
server_hold_commit(sb, &hold);
|
||||
mutex_lock(&server->logs_mutex);
|
||||
|
||||
ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
/* careful to only keep draining the caller's specific open trans */
|
||||
if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq ||
|
||||
drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) {
|
||||
ret = -ENOENT;
|
||||
break;
|
||||
}
|
||||
|
||||
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
|
||||
&super->logs_root, &key);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
/* moving can modify and return errors, always update caller and item */
|
||||
mutex_lock(&server->alloc_mutex);
|
||||
ret = alloc_move_empty(sb, &super->data_alloc, &drain.data_freed,
|
||||
COMMIT_HOLD_ALLOC_BUDGET / 2);
|
||||
mutex_unlock(&server->alloc_mutex);
|
||||
if (ret == -EINPROGRESS)
|
||||
ret = 0;
|
||||
|
||||
*lt = drain;
|
||||
err = scoutfs_btree_force(sb, &server->alloc, &server->wri,
|
||||
&super->logs_root, &key, &drain, sizeof(drain));
|
||||
BUG_ON(err < 0); /* dirtying must guarantee success */
|
||||
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
|
||||
ret = server_apply_commit(sb, &hold, ret);
|
||||
if (ret < 0) {
|
||||
ret = 0; /* don't try to abort, ignoring ret */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* try to cleanly abort and write any partial dirty btree blocks, but ignore result */
|
||||
if (ret < 0) {
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
server_apply_commit(sb, &hold, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Give the client roots to all the trees that they'll use to build
|
||||
* their transaction.
|
||||
@@ -1267,6 +1343,7 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
char *err_str = NULL;
|
||||
u64 nr;
|
||||
int ret;
|
||||
int err;
|
||||
|
||||
if (arg_len != 0) {
|
||||
ret = -EINVAL;
|
||||
@@ -1310,16 +1387,27 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
if (ret != -ENOENT) {
|
||||
/* need to sync lt with respect to changes in other structures */
|
||||
scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid), le64_to_cpu(lt.nr));
|
||||
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
|
||||
&super->logs_root, &key);
|
||||
if (ret < 0) {
|
||||
err_str = "dirtying lt btree key";
|
||||
goto unlock;
|
||||
}
|
||||
}
|
||||
|
||||
/* drops and re-acquires the mutex and commit if it has to wait */
|
||||
ret = finalize_and_start_log_merge(sb, &lt, rid, &hold);
|
||||
if (ret < 0)
|
||||
goto unlock;
|
||||
goto update;
|
||||
|
||||
if (get_volopt_val(server, SCOUTFS_VOLOPT_DATA_ALLOC_ZONE_BLOCKS_NR, &data_zone_blocks)) {
|
||||
ret = get_data_alloc_zone_bits(sb, rid, exclusive, vacant, data_zone_blocks);
|
||||
if (ret < 0) {
|
||||
err_str = "getting alloc zone bits";
|
||||
goto unlock;
|
||||
goto update;
|
||||
}
|
||||
} else {
|
||||
data_zone_blocks = 0;
|
||||
@@ -1336,13 +1424,15 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
&lt.meta_freed);
|
||||
if (ret < 0) {
|
||||
err_str = "splicing committed meta_freed";
|
||||
goto unlock;
|
||||
goto update;
|
||||
}
|
||||
|
||||
ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 0);
|
||||
ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100);
|
||||
if (ret == -EINPROGRESS)
|
||||
ret = 0;
|
||||
if (ret < 0) {
|
||||
err_str = "emptying committed data_freed";
|
||||
goto unlock;
|
||||
goto update;
|
||||
}
|
||||
|
||||
ret = scoutfs_alloc_fill_list(sb, &server->alloc, &server->wri,
|
||||
@@ -1351,7 +1441,7 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
SCOUTFS_SERVER_META_FILL_TARGET);
|
||||
if (ret < 0) {
|
||||
err_str = "filling meta_avail";
|
||||
goto unlock;
|
||||
goto update;
|
||||
}
|
||||
|
||||
if (le64_to_cpu(server->meta_avail->total_len) <= scoutfs_server_reserved_meta_blocks(sb))
|
||||
@@ -1364,7 +1454,7 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
exclusive, vacant, data_zone_blocks);
|
||||
if (ret < 0) {
|
||||
err_str = "refilling data_avail";
|
||||
goto unlock;
|
||||
goto update;
|
||||
}
|
||||
|
||||
if (le64_to_cpu(lt.data_avail.total_len) < SCOUTFS_SERVER_DATA_FILL_LO)
|
||||
@@ -1384,7 +1474,7 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
if (ret < 0) {
|
||||
zero_data_alloc_zone_bits(&lt);
|
||||
err_str = "setting data_avail zone bits";
|
||||
goto unlock;
|
||||
goto update;
|
||||
}
|
||||
|
||||
lt.data_alloc_zone_blocks = cpu_to_le64(data_zone_blocks);
|
||||
@@ -1393,13 +1483,18 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
/* give the transaction a new seq (must have been ==) */
|
||||
lt.get_trans_seq = cpu_to_le64(scoutfs_server_next_seq(sb));
|
||||
|
||||
update:
|
||||
/* update client's log tree's item */
|
||||
scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid),
|
||||
le64_to_cpu(lt.nr));
|
||||
ret = scoutfs_btree_force(sb, &server->alloc, &server->wri,
|
||||
scoutfs_key_init_log_trees(&key, le64_to_cpu(lt.rid), le64_to_cpu(lt.nr));
|
||||
err = scoutfs_btree_force(sb, &server->alloc, &server->wri,
|
||||
&super->logs_root, &key, &lt, sizeof(lt));
|
||||
if (ret < 0)
|
||||
err_str = "updating log trees";
|
||||
BUG_ON(err < 0); /* can duplicate extents.. move dst in super, still in in lt src */
|
||||
if (err < 0) {
|
||||
if (ret == 0) {
|
||||
ret = err;
|
||||
err_str = "updating log trees";
|
||||
}
|
||||
}
|
||||
|
||||
unlock:
|
||||
if (unlock_alloc)
|
||||
@@ -1412,6 +1507,10 @@ out:
|
||||
scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
|
||||
ret, rid, err_str);
|
||||
|
||||
/* try to drain excessive data_freed with additional commits, if needed, ignoring err */
|
||||
if (ret == 0)
|
||||
try_drain_data_freed(sb, &lt);
|
||||
|
||||
return scoutfs_net_response(sb, conn, cmd, id, ret, &lt, sizeof(lt));
|
||||
}
|
||||
|
||||
|
||||
@@ -496,7 +496,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
|
||||
ret = assign_random_id(sbi);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
goto out;
|
||||
|
||||
spin_lock_init(&sbi->next_ino_lock);
|
||||
spin_lock_init(&sbi->data_wait_root.lock);
|
||||
@@ -505,7 +505,7 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
|
||||
/* parse options early for use during setup */
|
||||
ret = scoutfs_options_early_setup(sb, data);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
goto out;
|
||||
scoutfs_options_read(sb, &opts);
|
||||
|
||||
ret = sb_set_blocksize(sb, SCOUTFS_BLOCK_SM_SIZE);
|
||||
|
||||
@@ -10,7 +10,8 @@ BIN := src/createmany \
|
||||
src/bulk_create_paths \
|
||||
src/stage_tmpfile \
|
||||
src/find_xattrs \
|
||||
src/create_xattr_loop
|
||||
src/create_xattr_loop \
|
||||
src/fragmented_data_extents
|
||||
|
||||
DEPS := $(wildcard src/*.d)
|
||||
|
||||
|
||||
3
tests/golden/large-fragmented-free
Normal file
3
tests/golden/large-fragmented-free
Normal file
@@ -0,0 +1,3 @@
|
||||
== creating fragmented extents
|
||||
== unlink file with moved extents to free extents per block
|
||||
== cleanup
|
||||
@@ -9,6 +9,7 @@ fallocate.sh
|
||||
setattr_more.sh
|
||||
offline-extent-waiting.sh
|
||||
move-blocks.sh
|
||||
large-fragmented-free.sh
|
||||
enospc.sh
|
||||
srch-basic-functionality.sh
|
||||
simple-xattr-unit.sh
|
||||
|
||||
113
tests/src/fragmented_data_extents.c
Normal file
113
tests/src/fragmented_data_extents.c
Normal file
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This creates fragmented data extents.
|
||||
*
|
||||
* A file is created that has alternating free and allocated extents.
|
||||
* This also results in the global allocator having the matching
|
||||
* fragmented free extent pattern. While that file is being created,
|
||||
* occasionally an allocated extent is moved to another file. This
|
||||
* results in a file that has fragmented extents at a given stride that
|
||||
* can be deleted to create free data extents with a given stride.
|
||||
*
|
||||
* We don't have hole punching so to do this quickly we use a goofy
|
||||
* combination of fallocate, truncate, and our move_blocks ioctl.
|
||||
*/
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <linux/types.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "ioctl.h"
|
||||
|
||||
#define BLOCK_SIZE 4096
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct scoutfs_ioctl_move_blocks mb = {0,};
|
||||
unsigned long long freed_extents;
|
||||
unsigned long long move_stride;
|
||||
unsigned long long i;
|
||||
int alloc_fd;
|
||||
int trunc_fd;
|
||||
off_t off;
|
||||
int ret;
|
||||
|
||||
if (argc != 5) {
|
||||
printf("%s <freed_extents> <move_stride> <alloc_file> <trunc_file>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
freed_extents = strtoull(argv[1], NULL, 0);
|
||||
move_stride = strtoull(argv[2], NULL, 0);
|
||||
|
||||
alloc_fd = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
|
||||
if (alloc_fd == -1) {
|
||||
fprintf(stderr, "error opening %s: %d (%s)\n", argv[3], errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
trunc_fd = open(argv[4], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
|
||||
if (trunc_fd == -1) {
|
||||
fprintf(stderr, "error opening %s: %d (%s)\n", argv[4], errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (i = 0, off = 0; i < freed_extents; i++, off += BLOCK_SIZE * 2) {
|
||||
|
||||
ret = fallocate(alloc_fd, 0, off, BLOCK_SIZE * 2);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "fallocate at off %llu error: %d (%s)\n",
|
||||
(unsigned long long)off, errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
ret = ftruncate(alloc_fd, off + BLOCK_SIZE);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "truncate to off %llu error: %d (%s)\n",
|
||||
(unsigned long long)off + BLOCK_SIZE, errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((i % move_stride) == 0) {
|
||||
mb.from_fd = alloc_fd;
|
||||
mb.from_off = off;
|
||||
mb.len = BLOCK_SIZE;
|
||||
mb.to_off = i * BLOCK_SIZE;
|
||||
|
||||
ret = ioctl(trunc_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "move from off %llu error: %d (%s)\n",
|
||||
(unsigned long long)off,
|
||||
errno, strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (alloc_fd > -1)
|
||||
close(alloc_fd);
|
||||
if (trunc_fd > -1)
|
||||
close(trunc_fd);
|
||||
|
||||
return 0;
|
||||
}
|
||||
22
tests/tests/large-fragmented-free.sh
Normal file
22
tests/tests/large-fragmented-free.sh
Normal file
@@ -0,0 +1,22 @@
|
||||
#
# Free a file whose extents each land in a different btree block of the
# main free list, so the server sees a transaction with a data_freed
# that it probably has to merge in multiple commits.
#

t_require_commands fragmented_data_extents

EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))

alloc_file="$T_D0/alloc"
move_file="$T_D0/move"

echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$alloc_file" "$move_file"

echo "== unlink file with moved extents to free extents per block"
rm -f "$move_file"

echo "== cleanup"
rm -f "$alloc_file"

t_pass
|
||||
@@ -597,7 +597,7 @@ format.
|
||||
.PD
|
||||
|
||||
.TP
|
||||
.BI "print META-DEVICE"
|
||||
.BI "print [-S|--skip-likely-huge] META-DEVICE"
|
||||
.sp
|
||||
Prints out all of the metadata in the file system. This makes no effort
|
||||
to ensure that the structures are consistent as they're traversed and
|
||||
@@ -607,6 +607,20 @@ output.
|
||||
.PD 0
|
||||
.TP
|
||||
.sp
|
||||
.B "-S, --skip-likely-huge"
|
||||
Skip printing structures that are likely to be very large. The
|
||||
structures that are skipped tend to be global and their size tends to be
|
||||
related to the size of the volume. Examples of skipped structures include
|
||||
the global fs items, srch files, and metadata and data
|
||||
allocators. Similar structures that are not skipped are related to the
|
||||
number of mounts and are maintained at a relatively reasonable size.
|
||||
These include per-mount log trees, srch files, allocators, and the
|
||||
metadata allocators used by server commits.
|
||||
.sp
|
||||
Skipping the larger structures limits the print output to a relatively
|
||||
constant size rather than being a large multiple of the used metadata
|
||||
space of the volume making the output much more useful for inspection.
|
||||
.TP
|
||||
.B "META-DEVICE"
|
||||
The path to the metadata device for the filesystem whose metadata will be
|
||||
printed. Since this command reads via the host's buffer cache, it may not
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdbool.h>
|
||||
#include <ctype.h>
|
||||
#include <uuid/uuid.h>
|
||||
#include <sys/socket.h>
|
||||
@@ -989,9 +990,10 @@ static void print_super_block(struct scoutfs_super_block *super, u64 blkno)
|
||||
|
||||
struct print_args {
|
||||
char *meta_device;
|
||||
bool skip_likely_huge;
|
||||
};
|
||||
|
||||
static int print_volume(int fd)
|
||||
static int print_volume(int fd, struct print_args *args)
|
||||
{
|
||||
struct scoutfs_super_block *super = NULL;
|
||||
struct print_recursion_args pa;
|
||||
@@ -1041,23 +1043,26 @@ static int print_volume(int fd)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
for (i = 0; i < array_size(super->meta_alloc); i++) {
|
||||
snprintf(str, sizeof(str), "meta_alloc[%u]", i);
|
||||
err = print_btree(fd, super, str, &super->meta_alloc[i].root,
|
||||
if (!args->skip_likely_huge) {
|
||||
for (i = 0; i < array_size(super->meta_alloc); i++) {
|
||||
snprintf(str, sizeof(str), "meta_alloc[%u]", i);
|
||||
err = print_btree(fd, super, str, &super->meta_alloc[i].root,
|
||||
print_alloc_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
err = print_btree(fd, super, "data_alloc", &super->data_alloc.root,
|
||||
print_alloc_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
err = print_btree(fd, super, "data_alloc", &super->data_alloc.root,
|
||||
print_alloc_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
err = print_btree(fd, super, "srch_root", &super->srch_root,
|
||||
print_srch_root_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
err = print_btree(fd, super, "logs_root", &super->logs_root,
|
||||
print_log_trees_item, NULL);
|
||||
if (err && !ret)
|
||||
@@ -1065,19 +1070,23 @@ static int print_volume(int fd)
|
||||
|
||||
pa.super = super;
|
||||
pa.fd = fd;
|
||||
err = print_btree_leaf_items(fd, super, &super->srch_root.ref,
|
||||
print_srch_root_files, &pa);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
if (!args->skip_likely_huge) {
|
||||
err = print_btree_leaf_items(fd, super, &super->srch_root.ref,
|
||||
print_srch_root_files, &pa);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
}
|
||||
err = print_btree_leaf_items(fd, super, &super->logs_root.ref,
|
||||
print_log_trees_roots, &pa);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
|
||||
err = print_btree(fd, super, "fs_root", &super->fs_root,
|
||||
print_fs_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
if (!args->skip_likely_huge) {
|
||||
err = print_btree(fd, super, "fs_root", &super->fs_root,
|
||||
print_fs_item, NULL);
|
||||
if (err && !ret)
|
||||
ret = err;
|
||||
}
|
||||
|
||||
out:
|
||||
free(super);
|
||||
@@ -1098,7 +1107,7 @@ static int do_print(struct print_args *args)
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = print_volume(fd);
|
||||
ret = print_volume(fd, args);
|
||||
close(fd);
|
||||
return ret;
|
||||
};
|
||||
@@ -1108,6 +1117,9 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
struct print_args *args = state->input;
|
||||
|
||||
switch (key) {
|
||||
case 'S':
|
||||
args->skip_likely_huge = true;
|
||||
break;
|
||||
case ARGP_KEY_ARG:
|
||||
if (!args->meta_device)
|
||||
args->meta_device = strdup_or_error(state, arg);
|
||||
@@ -1125,8 +1137,13 @@ static int parse_opt(int key, char *arg, struct argp_state *state)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct argp_option options[] = {
|
||||
{ "skip-likely-huge", 'S', NULL, 0, "Skip large structures to minimize output size"},
|
||||
{ NULL }
|
||||
};
|
||||
|
||||
static struct argp argp = {
|
||||
NULL,
|
||||
options,
|
||||
parse_opt,
|
||||
"META-DEV",
|
||||
"Print metadata structures"
|
||||
|
||||
Reference in New Issue
Block a user