Merge pull request #98 from versity/zab/move_freed_many_commits

Zab/move freed many commits
This commit is contained in:
Zach Brown
2022-08-01 09:09:28 -07:00
committed by GitHub
9 changed files with 268 additions and 17 deletions

View File

@@ -892,12 +892,11 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r
* -ENOENT is returned if we run out of extents in the source tree
* before moving the total.
*
* If meta_reserved is non-zero then -EINPROGRESS can be returned if the
* current meta allocator's avail blocks or room for freed blocks would
* have fallen under the reserved amount. The could have been
* successfully dirtied in this case but the number of blocks moved is
* not returned. The caller is expected to deal with the partial
* progress by commiting the dirty trees and examining the resulting
* If meta_budget is non-zero then -EINPROGRESS can be returned if the
* the caller's budget is consumed in the allocator during this call
* (though not necessarily by us, we don't have per-thread tracking of
* allocator consumption :/). The call can still have made progress and
* caller is expected commit the dirty trees and examining the resulting
* modified trees to see if they need to continue moving extents.
*
* The caller can specify that extents in the source tree should first
@@ -914,7 +913,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_alloc_root *dst,
struct scoutfs_alloc_root *src, u64 total,
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved)
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget)
{
struct alloc_ext_args args = {
.alloc = alloc,
@@ -922,6 +921,8 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
};
struct scoutfs_extent found;
struct scoutfs_extent ext;
u32 avail_start = 0;
u32 freed_start = 0;
u64 moved = 0;
u64 count;
int ret = 0;
@@ -932,6 +933,9 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
vacant = NULL;
}
if (meta_budget != 0)
scoutfs_alloc_meta_remaining(alloc, &avail_start, &freed_start);
while (moved < total) {
count = total - moved;
@@ -964,10 +968,10 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
if (ret < 0)
break;
if (meta_reserved != 0 &&
scoutfs_alloc_meta_low(sb, alloc, meta_reserved +
extent_mod_blocks(src->root.height) +
extent_mod_blocks(dst->root.height))) {
if (meta_budget != 0 &&
scoutfs_alloc_meta_low_since(alloc, avail_start, freed_start, meta_budget,
extent_mod_blocks(src->root.height) +
extent_mod_blocks(dst->root.height))) {
ret = -EINPROGRESS;
break;
}
@@ -1351,6 +1355,27 @@ void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total,
} while (read_seqretry(&alloc->seqlock, seq));
}
/*
* Returns true if the caller's consumption of nr from either avail or
* freed would end up exceeding their budget relative to the starting
* remaining snapshot they took.
*/
bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
u32 budget, u32 nr)
{
u32 avail_use;
u32 freed_use;
u32 avail;
u32 freed;
scoutfs_alloc_meta_remaining(alloc, &avail, &freed);
avail_use = avail_start - avail;
freed_use = freed_start - freed;
return ((avail_use + nr) > budget) || ((freed_use + nr) > budget);
}
bool scoutfs_alloc_test_flag(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 flag)
{

View File

@@ -131,7 +131,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_alloc_root *dst,
struct scoutfs_alloc_root *src, u64 total,
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved);
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget);
int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
u64 start, u64 len);
@@ -159,6 +159,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb,
bool scoutfs_alloc_meta_low(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 nr);
void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space);
bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
u32 budget, u32 nr);
bool scoutfs_alloc_test_flag(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 flag);

View File

@@ -1685,6 +1685,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
struct scoutfs_lock *lock = NULL;
struct scoutfs_inode sinode;
struct scoutfs_key key;
bool clear_trying = false;
u64 group_nr;
int bit_nr;
int ret;
@@ -1704,6 +1705,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
ret = 0;
goto out;
}
clear_trying = true;
/* can't delete if it's cached in local or remote mounts */
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
@@ -1730,7 +1732,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
out:
if (ldata)
if (clear_trying)
clear_bit(bit_nr, ldata->trying);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);

View File

@@ -694,13 +694,13 @@ static int alloc_move_refill_zoned(struct super_block *sb, struct scoutfs_alloc_
static int alloc_move_empty(struct super_block *sb,
struct scoutfs_alloc_root *dst,
struct scoutfs_alloc_root *src, u64 meta_reserved)
struct scoutfs_alloc_root *src, u64 meta_budget)
{
DECLARE_SERVER_INFO(sb, server);
return scoutfs_alloc_move(sb, &server->alloc, &server->wri,
dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0,
meta_reserved);
meta_budget);
}
/*
@@ -1226,6 +1226,82 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
return ret;
}
/*
* The calling get_log_trees ran out of available blocks in its commit's
* metadata allocator while moving extents from the log tree's
* data_freed into the core data_avail. This finishes moving the
* extents in as many additional commits as it takes. The logs mutex
* is nested inside holding commits so we recheck the persistent item
* each time we commit to make sure it's still what we think. The
* caller is still going to send the item to the client so we update the
* caller's each time we make progress. This is a best-effort attempt
* to clean up and it's valid to leave extents in data_freed we don't
* return errors to the caller. The client will continue the work later
* in get_log_trees or as the rid is reclaimed.
*/
static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
const u64 rid = le64_to_cpu(lt->rid);
const u64 nr = le64_to_cpu(lt->nr);
struct scoutfs_log_trees drain;
struct scoutfs_key key;
COMMIT_HOLD(hold);
int ret = 0;
int err;
scoutfs_key_init_log_trees(&key, rid, nr);
while (lt->data_freed.total_len != 0) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain);
if (ret < 0)
break;
/* careful to only keep draining the caller's specific open trans */
if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq ||
drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) {
ret = -ENOENT;
break;
}
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
&super->logs_root, &key);
if (ret < 0)
break;
/* moving can modify and return errors, always update caller and item */
mutex_lock(&server->alloc_mutex);
ret = alloc_move_empty(sb, &super->data_alloc, &drain.data_freed,
COMMIT_HOLD_ALLOC_BUDGET / 2);
mutex_unlock(&server->alloc_mutex);
if (ret == -EINPROGRESS)
ret = 0;
*lt = drain;
err = scoutfs_btree_force(sb, &server->alloc, &server->wri,
&super->logs_root, &key, &drain, sizeof(drain));
BUG_ON(err < 0); /* dirtying must guarantee success */
mutex_unlock(&server->logs_mutex);
ret = server_apply_commit(sb, &hold, ret);
if (ret < 0) {
ret = 0; /* don't try to abort, ignoring ret */
break;
}
}
/* try to cleanly abort and write any partial dirty btree blocks, but ignore result */
if (ret < 0) {
mutex_unlock(&server->logs_mutex);
server_apply_commit(sb, &hold, 0);
}
}
/*
* Give the client roots to all the trees that they'll use to build
* their transaction.
@@ -1351,7 +1427,9 @@ static int server_get_log_trees(struct super_block *sb,
goto update;
}
ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 0);
ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100);
if (ret == -EINPROGRESS)
ret = 0;
if (ret < 0) {
err_str = "emptying committed data_freed";
goto update;
@@ -1429,6 +1507,10 @@ out:
scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
ret, rid, err_str);
/* try to drain excessive data_freed with additional commits, if needed, ignoring err */
if (ret == 0)
try_drain_data_freed(sb, &lt);
return scoutfs_net_response(sb, conn, cmd, id, ret, &lt, sizeof(lt));
}

View File

@@ -10,7 +10,8 @@ BIN := src/createmany \
src/bulk_create_paths \
src/stage_tmpfile \
src/find_xattrs \
src/create_xattr_loop
src/create_xattr_loop \
src/fragmented_data_extents
DEPS := $(wildcard src/*.d)

View File

@@ -0,0 +1,3 @@
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup

View File

@@ -9,6 +9,7 @@ fallocate.sh
setattr_more.sh
offline-extent-waiting.sh
move-blocks.sh
large-fragmented-free.sh
enospc.sh
srch-basic-functionality.sh
simple-xattr-unit.sh

View File

@@ -0,0 +1,113 @@
/*
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
/*
* This creates fragmented data extents.
*
* A file is created that has alternating free and allocated extents.
* This also results in the global allocator having the matching
* fragmented free extent pattern. While that file is being created,
* occasionally an allocated extent is moved to another file. This
* results in a file that has fragmented extents at a given stride that
* can be deleted to create free data extents with a given stride.
*
* We don't have hole punching so to do this quickly we use a goofy
* combination of fallocate, truncate, and our move_blocks ioctl.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <linux/types.h>
#include <assert.h>
#include "ioctl.h"
#define BLOCK_SIZE 4096
int main(int argc, char **argv)
{
struct scoutfs_ioctl_move_blocks mb = {0,};
unsigned long long freed_extents;
unsigned long long move_stride;
unsigned long long i;
int alloc_fd;
int trunc_fd;
off_t off;
int ret;
if (argc != 5) {
printf("%s <freed_extents> <move_stride> <alloc_file> <trunc_file>\n", argv[0]);
return 1;
}
freed_extents = strtoull(argv[1], NULL, 0);
move_stride = strtoull(argv[2], NULL, 0);
alloc_fd = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
if (alloc_fd == -1) {
fprintf(stderr, "error opening %s: %d (%s)\n", argv[3], errno, strerror(errno));
exit(1);
}
trunc_fd = open(argv[4], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
if (trunc_fd == -1) {
fprintf(stderr, "error opening %s: %d (%s)\n", argv[4], errno, strerror(errno));
exit(1);
}
for (i = 0, off = 0; i < freed_extents; i++, off += BLOCK_SIZE * 2) {
ret = fallocate(alloc_fd, 0, off, BLOCK_SIZE * 2);
if (ret < 0) {
fprintf(stderr, "fallocate at off %llu error: %d (%s)\n",
(unsigned long long)off, errno, strerror(errno));
exit(1);
}
ret = ftruncate(alloc_fd, off + BLOCK_SIZE);
if (ret < 0) {
fprintf(stderr, "truncate to off %llu error: %d (%s)\n",
(unsigned long long)off + BLOCK_SIZE, errno, strerror(errno));
exit(1);
}
if ((i % move_stride) == 0) {
mb.from_fd = alloc_fd;
mb.from_off = off;
mb.len = BLOCK_SIZE;
mb.to_off = i * BLOCK_SIZE;
ret = ioctl(trunc_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
if (ret < 0) {
fprintf(stderr, "move from off %llu error: %d (%s)\n",
(unsigned long long)off,
errno, strerror(errno));
}
}
}
if (alloc_fd > -1)
close(alloc_fd);
if (trunc_fd > -1)
close(trunc_fd);
return 0;
}

View File

@@ -0,0 +1,22 @@
#
# Make sure the server can handle a transaction with a data_freed whose
# blocks all hit different btree blocks in the main free list. It
# probably has to be merged in multiple commits.
#
t_require_commands fragmented_data_extents
EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
echo "== creating fragmented extents"
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"
echo "== unlink file with moved extents to free extents per block"
rm -f "$T_D0/move"
echo "== cleanup"
rm -f "$T_D0/alloc"
t_pass