Compare commits

..

4 Commits

Author SHA1 Message Date
Zach Brown
a49584739a Use count/scan objects shrinking interface
Move to the more recent interfaces for counting and scanning cached
objects to shrink.

Signed-off-by: Zach Brown <zab@versity.com>
2022-08-02 15:29:48 -07:00
Zach Brown
fd1c4777c2 Use more modern bio interfaces
Move towards modern bio interfaces, while unfortunately carrying along a
bunch of compat functions that let us still work with the old
incompatible interfaces.

Signed-off-by: Zach Brown <zab@versity.com>
2022-08-01 14:10:40 -07:00
Zach Brown
0b0beb2830 Use memalloc_nofs_save
memalloc_nofs_save() was introduced as preferential to trying to use GFP
flags to indicate that a task should not recurse during reclaim.  We use
it instead of the _noio_ we were using before.

Signed-off-by: Zach Brown <zab@versity.com>
2022-08-01 09:25:17 -07:00
Zach Brown
bb006191e0 Use percpu_counter_add_batch
__percpu_counter_add_batch was renamed to make it clear that the __
doesn't mean it's less safe, as it means in other calls in the API, but
just that it takes an additional parameter.

Signed-off-by: Zach Brown <zab@versity.com>
2022-07-21 11:23:02 -07:00
16 changed files with 228 additions and 338 deletions

View File

@@ -1,29 +1,6 @@
Versity ScoutFS Release Notes
=============================
---
v1.7
\
*Aug 26, 2022*
* **Fixed possible persistent errors moving freed data extents**
\
Fixed a case where the server could hit persistent errors trying to
move a client's freed extents in one commit. The client had to free
a large number of extents that occupied distant positions in the
global free extent btree. Very large fragmented files could cause
this. The server now moves the freed extents in multiple commits and
can always ensure forward progress.
* **Fixed possible persistent errors from freed duplicate extents**
\
Background orphan deletion wasn't properly synchronizing with
foreground tasks deleting very large files. If a deletion took long
enough then background deletion could also attempt to delete inode items
while the deletion was making progress. This could create duplicate
deletions of data extent items which causes the server to abort when
it later discovers the duplicate extents as it merges free lists.
---
v1.6
\

View File

@@ -46,6 +46,10 @@ scoutfs-y += \
volopt.o \
xattr.o
ifdef KC_BUILD_KERNELCOMPAT
scoutfs-y += kernelcompat.o
endif
#
# The raw types aren't available in userspace headers. Make sure all
# the types we use in the headers are the exported __ versions.

View File

@@ -34,3 +34,52 @@ endif
ifneq (,$(shell grep 'FMODE_KABI_ITERATE' include/linux/fs.h))
ccflags-y += -DKC_FMODE_KABI_ITERATE
endif
#
# v4.11-12447-g104b4e5139fe
#
# Renamed __percpu_counter_add to percpu_counter_add_batch to clarify
# that the __ wasn't less safe, just took an extra parameter.
#
ifneq (,$(shell grep 'percpu_counter_add_batch' include/linux/percpu_counter.h))
ccflags-y += -DKC_PERCPU_COUNTER_ADD_BATCH
endif
#
# v4.11-4550-g7dea19f9ee63
#
# Introduced memalloc_nofs_{save,restore} preferred instead of _noio_.
#
ifneq (,$(shell grep 'memalloc_nofs_save' include/linux/sched/mm.h))
ccflags-y += -DKC_MEMALLOC_NOFS_SAVE
endif
#
# v4.7-12414-g1eff9d322a44
#
# Renamed bi_rw to bi_opf to force old code to catch up. We use it as a
# single switch between old and new bio structures.
#
ifneq (,$(shell grep 'bi_opf' include/linux/blk_types.h))
ccflags-y += -DKC_BIO_BI_OPF
endif
#
# v4.12-rc2-201-g4e4cbee93d56
#
# Moves to bi_status BLK_STS_ API instead of having a mix of error
# end_io args or bi_error.
#
ifneq (,$(shell grep 'bi_status' include/linux/blk_types.h))
ccflags-y += -DKC_BIO_BI_STATUS
endif
#
# v3.11-8765-ga0b02131c5fc
#
# Remove the old ->shrink() API, ->{scan,count}_objects is preferred.
#
# The member declaration is matched as a fixed string.  As a basic
# regex the '*' would repeat the preceding '(' and any "shrink)" in the
# header would be enough to match.
#
ifneq (,$(shell grep -F '(*shrink)' include/linux/shrinker.h))
ccflags-y += -DKC_SHRINKER_SHRINK
KC_BUILD_KERNELCOMPAT=1
endif

View File

@@ -892,11 +892,12 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r
* -ENOENT is returned if we run out of extents in the source tree
* before moving the total.
*
* If meta_budget is non-zero then -EINPROGRESS can be returned if the
* caller's budget is consumed in the allocator during this call
* (though not necessarily by us, we don't have per-thread tracking of
* allocator consumption :/). The call can still have made progress and
* the caller is expected to commit the dirty trees and examine the resulting
* If meta_reserved is non-zero then -EINPROGRESS can be returned if the
* current meta allocator's avail blocks or room for freed blocks would
* have fallen under the reserved amount. The trees could have been
* successfully dirtied in this case but the number of blocks moved is
* not returned. The caller is expected to deal with the partial
* progress by committing the dirty trees and examining the resulting
* modified trees to see if they need to continue moving extents.
*
* The caller can specify that extents in the source tree should first
@@ -913,7 +914,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_alloc_root *dst,
struct scoutfs_alloc_root *src, u64 total,
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget)
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved)
{
struct alloc_ext_args args = {
.alloc = alloc,
@@ -921,8 +922,6 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
};
struct scoutfs_extent found;
struct scoutfs_extent ext;
u32 avail_start = 0;
u32 freed_start = 0;
u64 moved = 0;
u64 count;
int ret = 0;
@@ -933,9 +932,6 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
vacant = NULL;
}
if (meta_budget != 0)
scoutfs_alloc_meta_remaining(alloc, &avail_start, &freed_start);
while (moved < total) {
count = total - moved;
@@ -968,10 +964,10 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
if (ret < 0)
break;
if (meta_budget != 0 &&
scoutfs_alloc_meta_low_since(alloc, avail_start, freed_start, meta_budget,
extent_mod_blocks(src->root.height) +
extent_mod_blocks(dst->root.height))) {
if (meta_reserved != 0 &&
scoutfs_alloc_meta_low(sb, alloc, meta_reserved +
extent_mod_blocks(src->root.height) +
extent_mod_blocks(dst->root.height))) {
ret = -EINPROGRESS;
break;
}
@@ -1355,27 +1351,6 @@ void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total,
} while (read_seqretry(&alloc->seqlock, seq));
}
/*
 * Tests whether consuming nr more from avail or freed would take the
 * caller past their budget.  Use so far is measured as the drop in each
 * remaining count since the starting snapshot the caller took, and
 * either one going over the budget is enough to return true.
 */
bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
				  u32 budget, u32 nr)
{
	u32 avail_now;
	u32 freed_now;
	u32 avail_used;
	u32 freed_used;

	scoutfs_alloc_meta_remaining(alloc, &avail_now, &freed_now);

	avail_used = avail_start - avail_now;
	if (avail_used + nr > budget)
		return true;

	freed_used = freed_start - freed_now;
	return freed_used + nr > budget;
}
bool scoutfs_alloc_test_flag(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 flag)
{

View File

@@ -131,7 +131,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri,
struct scoutfs_alloc_root *dst,
struct scoutfs_alloc_root *src, u64 total,
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget);
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved);
int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
u64 start, u64 len);
@@ -159,8 +159,6 @@ int scoutfs_alloc_splice_list(struct super_block *sb,
bool scoutfs_alloc_meta_low(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 nr);
void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space);
bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
u32 budget, u32 nr);
bool scoutfs_alloc_test_flag(struct super_block *sb,
struct scoutfs_alloc *alloc, u32 flag);

View File

@@ -21,6 +21,7 @@
#include <linux/blkdev.h>
#include <linux/rhashtable.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
#include "format.h"
#include "super.h"
@@ -57,7 +58,7 @@ struct block_info {
atomic64_t access_counter;
struct rhashtable ht;
wait_queue_head_t waitq;
struct shrinker shrinker;
KC_DEFINE_SHRINKER(shrinker);
struct work_struct free_work;
struct llist_head free_llist;
};
@@ -128,7 +129,7 @@ static __le32 block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
{
struct block_private *bp;
unsigned int noio_flags;
unsigned int nofs_flags;
/*
* If we had multiple blocks per page we'd need to be a little
@@ -156,9 +157,9 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
* spurious reclaim-on dependencies and warnings.
*/
lockdep_off();
noio_flags = memalloc_noio_save();
nofs_flags = memalloc_nofs_save();
bp->virt = __vmalloc(SCOUTFS_BLOCK_LG_SIZE, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
memalloc_noio_restore(noio_flags);
memalloc_nofs_restore(nofs_flags);
lockdep_on();
if (!bp->virt) {
@@ -436,11 +437,10 @@ static void block_remove_all(struct super_block *sb)
* possible. Final freeing, verifying checksums, and unlinking errored
* blocks are all done by future users of the blocks.
*/
static void block_end_io(struct super_block *sb, int rw,
static void block_end_io(struct super_block *sb, unsigned int opf,
struct block_private *bp, int err)
{
DECLARE_BLOCK_INFO(sb, binf);
bool is_read = !(rw & WRITE);
if (err) {
scoutfs_inc_counter(sb, block_cache_end_io_error);
@@ -450,7 +450,7 @@ static void block_end_io(struct super_block *sb, int rw,
if (!atomic_dec_and_test(&bp->io_count))
return;
if (is_read && !test_bit(BLOCK_BIT_ERROR, &bp->bits))
if (!op_is_write(opf) && !test_bit(BLOCK_BIT_ERROR, &bp->bits))
set_bit(BLOCK_BIT_UPTODATE, &bp->bits);
clear_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
@@ -463,13 +463,13 @@ static void block_end_io(struct super_block *sb, int rw,
wake_up(&binf->waitq);
}
static void block_bio_end_io(struct bio *bio, int err)
static void KC_DECLARE_BIO_END_IO(block_bio_end_io, struct bio *bio)
{
struct block_private *bp = bio->bi_private;
struct super_block *sb = bp->sb;
TRACE_BLOCK(end_io, bp);
block_end_io(sb, bio->bi_rw, bp, err);
block_end_io(sb, kc_bio_get_opf(bio), bp, kc_bio_get_errno(bio));
bio_put(bio);
}
@@ -477,7 +477,7 @@ static void block_bio_end_io(struct bio *bio, int err)
* Kick off IO for a single block.
*/
static int block_submit_bio(struct super_block *sb, struct block_private *bp,
int rw)
unsigned int opf)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct bio *bio = NULL;
@@ -510,8 +510,9 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
break;
}
bio->bi_sector = sector + (off >> 9);
bio->bi_bdev = sbi->meta_bdev;
kc_bio_set_opf(bio, opf);
kc_bio_set_sector(bio, sector + (off >> 9));
bio_set_dev(bio, sbi->meta_bdev);
bio->bi_end_io = block_bio_end_io;
bio->bi_private = bp;
@@ -528,18 +529,18 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
BUG();
if (!bio_add_page(bio, page, PAGE_SIZE, 0)) {
submit_bio(rw, bio);
submit_bio(bio);
bio = NULL;
}
}
if (bio)
submit_bio(rw, bio);
submit_bio(bio);
blk_finish_plug(&plug);
/* let racing end_io know we're done */
block_end_io(sb, rw, bp, ret);
block_end_io(sb, opf, bp, ret);
return ret;
}
@@ -640,7 +641,7 @@ static struct block_private *block_read(struct super_block *sb, u64 blkno)
if (!test_bit(BLOCK_BIT_UPTODATE, &bp->bits) &&
test_and_clear_bit(BLOCK_BIT_NEW, &bp->bits)) {
ret = block_submit_bio(sb, bp, READ);
ret = block_submit_bio(sb, bp, REQ_OP_READ);
if (ret < 0)
goto out;
}
@@ -939,7 +940,7 @@ int scoutfs_block_writer_write(struct super_block *sb,
/* retry previous write errors */
clear_bit(BLOCK_BIT_ERROR, &bp->bits);
ret = block_submit_bio(sb, bp, WRITE);
ret = block_submit_bio(sb, bp, REQ_OP_WRITE);
if (ret < 0)
break;
}
@@ -1039,6 +1040,17 @@ u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb,
return wri->nr_dirty_blocks * SCOUTFS_BLOCK_LG_SIZE;
}
static unsigned long block_count_objects(struct shrinker *shrink, struct shrink_control *sc)
{
struct block_info *binf = container_of(shrink, struct block_info, shrinker);
struct super_block *sb = binf->sb;
scoutfs_inc_counter(sb, block_cache_scan_objects);
return min_t(u64, (u64)atomic_read(&binf->total_inserted) * SCOUTFS_BLOCK_LG_PAGES_PER,
ULONG_MAX / 2); /* magic numbers as we approach ~0UL :/ */
}
/*
* Remove a number of cached blocks that haven't been used recently.
*
@@ -1059,23 +1071,19 @@ u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb,
* atomically remove blocks when the only references are ours and the
* hash table.
*/
static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)
static unsigned long block_scan_objects(struct shrinker *shrink, struct shrink_control *sc)
{
struct block_info *binf = container_of(shrink, struct block_info,
shrinker);
struct block_info *binf = container_of(shrink, struct block_info, shrinker);
struct super_block *sb = binf->sb;
struct rhashtable_iter iter;
struct block_private *bp;
unsigned long freed = 0;
unsigned long nr;
u64 recently;
nr = sc->nr_to_scan;
if (nr == 0)
goto out;
scoutfs_inc_counter(sb, block_cache_scan_objects);
scoutfs_inc_counter(sb, block_cache_shrink);
nr = DIV_ROUND_UP(nr, SCOUTFS_BLOCK_LG_PAGES_PER);
nr = DIV_ROUND_UP(sc->nr_to_scan, SCOUTFS_BLOCK_LG_PAGES_PER);
restart:
recently = accessed_recently(binf);
@@ -1118,6 +1126,7 @@ restart:
if (block_remove_solo(sb, bp)) {
scoutfs_inc_counter(sb, block_cache_shrink_remove);
TRACE_BLOCK(shrink, bp);
freed++;
nr--;
}
block_put(sb, bp);
@@ -1126,9 +1135,8 @@ restart:
rhashtable_walk_stop(&iter);
rhashtable_walk_exit(&iter);
out:
return min_t(u64, (u64)atomic_read(&binf->total_inserted) * SCOUTFS_BLOCK_LG_PAGES_PER,
INT_MAX);
return freed;
}
struct sm_block_completion {
@@ -1136,11 +1144,11 @@ struct sm_block_completion {
int err;
};
static void sm_block_bio_end_io(struct bio *bio, int err)
static void KC_DECLARE_BIO_END_IO(sm_block_bio_end_io, struct bio *bio)
{
struct sm_block_completion *sbc = bio->bi_private;
sbc->err = err;
sbc->err = kc_bio_get_errno(bio);
complete(&sbc->comp);
bio_put(bio);
}
@@ -1155,9 +1163,8 @@ static void sm_block_bio_end_io(struct bio *bio, int err)
* only layer that sees the full block buffer so we pass the calculated
* crc to the caller for them to check in their context.
*/
static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw, u64 blkno,
struct scoutfs_block_header *hdr, size_t len,
__le32 *blk_crc)
static int sm_block_io(struct super_block *sb, struct block_device *bdev, unsigned int opf,
u64 blkno, struct scoutfs_block_header *hdr, size_t len, __le32 *blk_crc)
{
struct scoutfs_block_header *pg_hdr;
struct sm_block_completion sbc;
@@ -1171,7 +1178,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw
return -EIO;
if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
WARN_ON_ONCE(!(rw & WRITE) && !blk_crc))
WARN_ON_ONCE(!op_is_write(opf) && !blk_crc))
return -EINVAL;
page = alloc_page(GFP_NOFS);
@@ -1180,7 +1187,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw
pg_hdr = page_address(page);
if (rw & WRITE) {
if (op_is_write(opf)) {
memcpy(pg_hdr, hdr, len);
if (len < SCOUTFS_BLOCK_SM_SIZE)
memset((char *)pg_hdr + len, 0,
@@ -1194,8 +1201,9 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw
goto out;
}
bio->bi_sector = blkno << (SCOUTFS_BLOCK_SM_SHIFT - 9);
bio->bi_bdev = bdev;
bio->bi_opf = opf | REQ_SYNC;
kc_bio_set_sector(bio, blkno << (SCOUTFS_BLOCK_SM_SHIFT - 9));
bio_set_dev(bio, bdev);
bio->bi_end_io = sm_block_bio_end_io;
bio->bi_private = &sbc;
bio_add_page(bio, page, SCOUTFS_BLOCK_SM_SIZE, 0);
@@ -1203,12 +1211,12 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw
init_completion(&sbc.comp);
sbc.err = 0;
submit_bio((rw & WRITE) ? WRITE_SYNC : READ_SYNC, bio);
submit_bio(bio);
wait_for_completion(&sbc.comp);
ret = sbc.err;
if (ret == 0 && !(rw & WRITE)) {
if (ret == 0 && !op_is_write(opf)) {
memcpy(hdr, pg_hdr, len);
*blk_crc = block_calc_crc(pg_hdr, SCOUTFS_BLOCK_SM_SIZE);
}
@@ -1222,14 +1230,14 @@ int scoutfs_block_read_sm(struct super_block *sb,
struct scoutfs_block_header *hdr, size_t len,
__le32 *blk_crc)
{
return sm_block_io(sb, bdev, READ, blkno, hdr, len, blk_crc);
return sm_block_io(sb, bdev, REQ_OP_READ, blkno, hdr, len, blk_crc);
}
int scoutfs_block_write_sm(struct super_block *sb,
struct block_device *bdev, u64 blkno,
struct scoutfs_block_header *hdr, size_t len)
{
return sm_block_io(sb, bdev, WRITE, blkno, hdr, len, NULL);
return sm_block_io(sb, bdev, REQ_OP_WRITE, blkno, hdr, len, NULL);
}
int scoutfs_block_setup(struct super_block *sb)
@@ -1254,7 +1262,8 @@ int scoutfs_block_setup(struct super_block *sb)
atomic_set(&binf->total_inserted, 0);
atomic64_set(&binf->access_counter, 0);
init_waitqueue_head(&binf->waitq);
binf->shrinker.shrink = block_shrink;
KC_INIT_SHRINKER_FUNCS(struct block_info, shrinker,
&binf->shrinker, block_count_objects, block_scan_objects);
binf->shrinker.seeks = DEFAULT_SEEKS;
register_shrinker(&binf->shrinker);
INIT_WORK(&binf->free_work, block_free_work);

View File

@@ -30,6 +30,8 @@
EXPAND_COUNTER(block_cache_free) \
EXPAND_COUNTER(block_cache_free_work) \
EXPAND_COUNTER(block_cache_remove_stale) \
EXPAND_COUNTER(block_cache_count_objects) \
EXPAND_COUNTER(block_cache_scan_objects) \
EXPAND_COUNTER(block_cache_shrink) \
EXPAND_COUNTER(block_cache_shrink_next) \
EXPAND_COUNTER(block_cache_shrink_recent) \
@@ -235,12 +237,12 @@ struct scoutfs_counters {
#define SCOUTFS_PCPU_COUNTER_BATCH (1 << 30)
#define scoutfs_inc_counter(sb, which) \
__percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, 1, \
SCOUTFS_PCPU_COUNTER_BATCH)
percpu_counter_add_batch(&SCOUTFS_SB(sb)->counters->which, 1, \
SCOUTFS_PCPU_COUNTER_BATCH)
#define scoutfs_add_counter(sb, which, cnt) \
__percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, cnt, \
SCOUTFS_PCPU_COUNTER_BATCH)
percpu_counter_add_batch(&SCOUTFS_SB(sb)->counters->which, cnt, \
SCOUTFS_PCPU_COUNTER_BATCH)
void __init scoutfs_init_counters(void);
int scoutfs_setup_counters(struct super_block *sb);

View File

@@ -1685,7 +1685,6 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
struct scoutfs_lock *lock = NULL;
struct scoutfs_inode sinode;
struct scoutfs_key key;
bool clear_trying = false;
u64 group_nr;
int bit_nr;
int ret;
@@ -1705,7 +1704,6 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
ret = 0;
goto out;
}
clear_trying = true;
/* can't delete if it's cached in local or remote mounts */
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
@@ -1732,7 +1730,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
out:
if (clear_trying)
if (ldata)
clear_bit(bit_nr, ldata->trying);
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);

23
kmod/src/kernelcompat.c Normal file
View File

@@ -0,0 +1,23 @@
#include "kernelcompat.h"
#ifdef KC_SHRINKER_SHRINK
#include <linux/shrinker.h>
/*
 * A .shrink() implementation for targets that don't have the
 * .{count,scan}_objects() interface.  It's built from the count/scan
 * functions stored next to the shrinker: scanning only happens when the
 * caller asks for a non-zero number of objects, and the current count
 * is always returned, clamped to what fits in the int return.
 */
int kc_shrink_wrapper(struct shrinker *shrink, struct shrink_control *sc)
{
	struct kc_shrinker_funcs *funcs = KC_SHRINKER_FUNCS(shrink);
	unsigned long count;

	if (sc->nr_to_scan)
		funcs->scan_objects(shrink, sc);

	count = funcs->count_objects(shrink, sc);
	if (count > INT_MAX)
		count = INT_MAX;

	return count;
}
#endif

View File

@@ -46,4 +46,81 @@ static inline int dir_emit_dots(struct file *file, void *dirent,
}
#endif
/*
 * Kernels that predate the rename of __percpu_counter_add to
 * percpu_counter_add_batch get the new name aliased to the old function.
 * The build sets KC_PERCPU_COUNTER_ADD_BATCH when the new name exists.
 */
#ifndef KC_PERCPU_COUNTER_ADD_BATCH
#define percpu_counter_add_batch __percpu_counter_add
#endif
/*
 * When the build doesn't find memalloc_nofs_save (KC_MEMALLOC_NOFS_SAVE
 * unset) the nofs save/restore names are aliased to the older noio
 * calls.
 */
#ifndef KC_MEMALLOC_NOFS_SAVE
#define memalloc_nofs_save memalloc_noio_save
#define memalloc_nofs_restore memalloc_noio_restore
#endif
/*
 * Accessors that hide which struct bio layout the target kernel has.
 * KC_BIO_BI_OPF is the single switch: with it the op flags live in
 * bi_opf and the sector in bi_iter.bi_sector, without it they're in
 * bi_rw and bi_sector.
 */
#ifdef KC_BIO_BI_OPF
#define kc_bio_get_opf(bio)			\
({						\
	(bio)->bi_opf;				\
})
#define kc_bio_set_opf(bio, opf)		\
do {						\
	(bio)->bi_opf = (opf);			\
} while (0)
#define kc_bio_set_sector(bio, sect)		\
do {						\
	(bio)->bi_iter.bi_sector = (sect);	\
} while (0)
#else
#define kc_bio_get_opf(bio)			\
({						\
	(bio)->bi_rw;				\
})
/* the setter must write the same bi_rw member that the getter reads */
#define kc_bio_set_opf(bio, opf)		\
do {						\
	(bio)->bi_rw = (opf);			\
} while (0)
#define kc_bio_set_sector(bio, sect)		\
do {						\
	(bio)->bi_sector = (sect);		\
} while (0)
#endif
/*
 * KC_DECLARE_BIO_END_IO() expands to a bio end_io function's name and
 * parameter list.  With KC_BIO_BI_STATUS the function only takes the
 * bio and kc_bio_get_errno() converts the bio's bi_status with
 * blk_status_to_errno().  Without it the function gains a trailing
 * "int _error_arg" parameter and kc_bio_get_errno() evaluates to that
 * argument, so in that case it only works inside a function that was
 * declared with KC_DECLARE_BIO_END_IO().
 */
#ifdef KC_BIO_BI_STATUS
#define KC_DECLARE_BIO_END_IO(name, bio) name(bio)
#define kc_bio_get_errno(bio) ({ blk_status_to_errno((bio)->bi_status); })
#else
#define KC_DECLARE_BIO_END_IO(name, bio) name(bio, int _error_arg)
#define kc_bio_get_errno(bio) ({ (int)((void)(bio), _error_arg); })
#endif
/*
 * Shrinker registration helpers.
 *
 * KC_DEFINE_SHRINKER(name) declares a struct shrinker member called
 * name inside a containing struct.  KC_INIT_SHRINKER_FUNCS(type, name,
 * shr, count, scan) installs the count and scan functions for the
 * shrinker pointed to by shr, where type is the containing struct and
 * name is the member that was given to KC_DEFINE_SHRINKER().
 *
 * Kernels with ->{count,scan}_objects() store the functions directly in
 * the shrinker.  Kernels that only have ->shrink() (KC_SHRINKER_SHRINK)
 * store them in a kc_shrinker_funcs placed immediately before the
 * shrinker and point ->shrink() at kc_shrink_wrapper(), which finds
 * them again with KC_SHRINKER_FUNCS().
 */
#ifndef KC_SHRINKER_SHRINK

#define KC_DEFINE_SHRINKER(name) struct shrinker name

#define KC_INIT_SHRINKER_FUNCS(type, name, shr, count, scan) do {	\
	__typeof__(shr) _shrink = (shr);				\
	_shrink->count_objects = count;					\
	_shrink->scan_objects = scan;					\
} while (0)

#else

#include <linux/shrinker.h>

struct kc_shrinker_funcs {
	unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc);
	unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc);
};

int kc_shrink_wrapper(struct shrinker *shrink, struct shrink_control *sc);

/* using adjacent member of an unnamed struct */
#define KC_DEFINE_SHRINKER(name)					\
	struct {							\
		struct kc_shrinker_funcs shrinker_funcs;		\
		struct shrinker name;					\
	}

/* step back from the shrinker to the funcs stored just before it */
#define KC_SHRINKER_FUNCS(shrinker)					\
	((void *)((unsigned long)(shrinker) - sizeof(struct kc_shrinker_funcs)))

/*
 * The pointer parameter can't be called "shrink": parameter
 * substitution would also rewrite the ->shrink member name.  The
 * BUILD_BUG_ON checks that the funcs really do end where the shrinker
 * starts so that KC_SHRINKER_FUNCS() lands on them.
 */
#define KC_INIT_SHRINKER_FUNCS(type, name, shr, count, scan) do {	\
	__typeof__(shr) _shrink = (shr);				\
	struct kc_shrinker_funcs *_funcs;				\
									\
	BUILD_BUG_ON(offsetof(type, shrinker_funcs) +			\
		     sizeof(struct kc_shrinker_funcs) !=		\
		     offsetof(type, name));				\
	_funcs = KC_SHRINKER_FUNCS(_shrink);				\
	_funcs->count_objects = count;					\
	_funcs->scan_objects = scan;					\
	_shrink->shrink = kc_shrink_wrapper;				\
} while (0)

#endif
#endif

View File

@@ -694,13 +694,13 @@ static int alloc_move_refill_zoned(struct super_block *sb, struct scoutfs_alloc_
static int alloc_move_empty(struct super_block *sb,
struct scoutfs_alloc_root *dst,
struct scoutfs_alloc_root *src, u64 meta_budget)
struct scoutfs_alloc_root *src, u64 meta_reserved)
{
DECLARE_SERVER_INFO(sb, server);
return scoutfs_alloc_move(sb, &server->alloc, &server->wri,
dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0,
meta_budget);
meta_reserved);
}
/*
@@ -1226,82 +1226,6 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
return ret;
}
/*
 * The calling get_log_trees ran out of available blocks in its commit's
 * metadata allocator while moving extents from the log tree's
 * data_freed into the core data_avail.  This finishes moving the
 * extents in as many additional commits as it takes.  The logs mutex
 * is nested inside holding commits so we recheck the persistent item
 * each time we commit to make sure it's still what we think.  The
 * caller is still going to send the item to the client so we update the
 * caller's copy each time we make progress.
 *
 * This is a best-effort attempt to clean up.  It's valid to leave
 * extents in data_freed, so we don't return errors to the caller.  The
 * client will continue the work later in get_log_trees or as the rid is
 * reclaimed.
 */
static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
{
DECLARE_SERVER_INFO(sb, server);
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
const u64 rid = le64_to_cpu(lt->rid);
const u64 nr = le64_to_cpu(lt->nr);
struct scoutfs_log_trees drain;
struct scoutfs_key key;
COMMIT_HOLD(hold);
int ret = 0;
int err;
scoutfs_key_init_log_trees(&key, rid, nr);
/* each pass holds its own commit and takes the logs mutex inside it */
while (lt->data_freed.total_len != 0) {
server_hold_commit(sb, &hold);
mutex_lock(&server->logs_mutex);
/*
 * Breaking out with ret < 0 before the unlock below leaves the commit
 * held and the logs mutex locked, both are released after the loop.
 */
ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain);
if (ret < 0)
break;
/* careful to only keep draining the caller's specific open trans */
if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq ||
drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) {
ret = -ENOENT;
break;
}
/* dirty the item first so that the forced update below is expected to succeed */
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
&super->logs_root, &key);
if (ret < 0)
break;
/* moving can modify and return errors, always update caller and item */
mutex_lock(&server->alloc_mutex);
ret = alloc_move_empty(sb, &super->data_alloc, &drain.data_freed,
COMMIT_HOLD_ALLOC_BUDGET / 2);
mutex_unlock(&server->alloc_mutex);
/* -EINPROGRESS isn't treated as an error, the loop continues if extents remain */
if (ret == -EINPROGRESS)
ret = 0;
*lt = drain;
err = scoutfs_btree_force(sb, &server->alloc, &server->wri,
&super->logs_root, &key, &drain, sizeof(drain));
BUG_ON(err < 0); /* dirtying must guarantee success */
mutex_unlock(&server->logs_mutex);
/* a remaining move error is handed to the commit through ret */
ret = server_apply_commit(sb, &hold, ret);
if (ret < 0) {
ret = 0; /* don't try to abort, ignoring ret */
break;
}
}
/* try to cleanly abort and write any partial dirty btree blocks, but ignore result */
if (ret < 0) {
mutex_unlock(&server->logs_mutex);
server_apply_commit(sb, &hold, 0);
}
}
/*
* Give the client roots to all the trees that they'll use to build
* their transaction.
@@ -1427,9 +1351,7 @@ static int server_get_log_trees(struct super_block *sb,
goto update;
}
ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 100);
if (ret == -EINPROGRESS)
ret = 0;
ret = alloc_move_empty(sb, &super->data_alloc, &lt.data_freed, 0);
if (ret < 0) {
err_str = "emptying committed data_freed";
goto update;
@@ -1507,10 +1429,6 @@ out:
scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
ret, rid, err_str);
/* try to drain excessive data_freed with additional commits, if needed, ignoring err */
if (ret == 0)
try_drain_data_freed(sb, &lt);
return scoutfs_net_response(sb, conn, cmd, id, ret, &lt, sizeof(lt));
}

View File

@@ -10,8 +10,7 @@ BIN := src/createmany \
src/bulk_create_paths \
src/stage_tmpfile \
src/find_xattrs \
src/create_xattr_loop \
src/fragmented_data_extents
src/create_xattr_loop
DEPS := $(wildcard src/*.d)

View File

@@ -1,3 +0,0 @@
== creating fragmented extents
== unlink file with moved extents to free extents per block
== cleanup

View File

@@ -9,7 +9,6 @@ fallocate.sh
setattr_more.sh
offline-extent-waiting.sh
move-blocks.sh
large-fragmented-free.sh
enospc.sh
srch-basic-functionality.sh
simple-xattr-unit.sh

View File

@@ -1,113 +0,0 @@
/*
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
/*
* This creates fragmented data extents.
*
* A file is created that has alternating free and allocated extents.
* This also results in the global allocator having the matching
* fragmented free extent pattern. While that file is being created,
* occasionally an allocated extent is moved to another file. This
* results in a file that has fragmented extents at a given stride that
* can be deleted to create free data extents with a given stride.
*
* We don't have hole punching so to do this quickly we use a goofy
* combination of fallocate, truncate, and our move_blocks ioctl.
*/
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <linux/types.h>
#include <assert.h>
#include "ioctl.h"
#define BLOCK_SIZE 4096
/*
 * Usage: <freed_extents> <move_stride> <alloc_file> <trunc_file>
 *
 * For each of freed_extents iterations this fallocates two blocks at the
 * end of alloc_file and truncates the second one away again.  Every
 * move_stride iterations the remaining block is moved into trunc_file
 * with the move_blocks ioctl.
 *
 * Returns 1 on bad arguments, exits with 1 if opening, fallocate, or
 * truncate fail, and returns 0 otherwise.
 */
int main(int argc, char **argv)
{
	struct scoutfs_ioctl_move_blocks mb = {0,};
	unsigned long long freed_extents;
	unsigned long long move_stride;
	unsigned long long i;
	int alloc_fd;
	int trunc_fd;
	off_t off;
	int ret;

	if (argc != 5) {
		printf("%s <freed_extents> <move_stride> <alloc_file> <trunc_file>\n", argv[0]);
		return 1;
	}

	freed_extents = strtoull(argv[1], NULL, 0);
	move_stride = strtoull(argv[2], NULL, 0);

	/* the stride is used as a modulus below, zero would divide by zero */
	if (move_stride == 0) {
		fprintf(stderr, "move_stride must be greater than 0\n");
		return 1;
	}

	alloc_fd = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
	if (alloc_fd == -1) {
		fprintf(stderr, "error opening %s: %d (%s)\n", argv[3], errno, strerror(errno));
		exit(1);
	}

	trunc_fd = open(argv[4], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
	if (trunc_fd == -1) {
		fprintf(stderr, "error opening %s: %d (%s)\n", argv[4], errno, strerror(errno));
		exit(1);
	}

	for (i = 0, off = 0; i < freed_extents; i++, off += BLOCK_SIZE * 2) {

		/* allocate two blocks, then drop the second to leave a gap */
		ret = fallocate(alloc_fd, 0, off, BLOCK_SIZE * 2);
		if (ret < 0) {
			fprintf(stderr, "fallocate at off %llu error: %d (%s)\n",
				(unsigned long long)off, errno, strerror(errno));
			exit(1);
		}

		ret = ftruncate(alloc_fd, off + BLOCK_SIZE);
		if (ret < 0) {
			fprintf(stderr, "truncate to off %llu error: %d (%s)\n",
				(unsigned long long)off + BLOCK_SIZE, errno, strerror(errno));
			exit(1);
		}

		if ((i % move_stride) == 0) {
			mb.from_fd = alloc_fd;
			mb.from_off = off;
			mb.len = BLOCK_SIZE;
			mb.to_off = i * BLOCK_SIZE;

			/*
			 * NOTE(review): unlike the failures above, a failed
			 * move is only reported and the loop carries on --
			 * confirm that's intended.
			 */
			ret = ioctl(trunc_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
			if (ret < 0) {
				fprintf(stderr, "move from off %llu error: %d (%s)\n",
					(unsigned long long)off,
					errno, strerror(errno));
			}
		}
	}

	/* both opens succeeded if we got here */
	close(alloc_fd);
	close(trunc_fd);

	return 0;
}

View File

@@ -1,22 +0,0 @@
#
# Make sure the server can handle a transaction with a data_freed whose
# blocks all hit different btree blocks in the main free list. It
# probably has to be merged in multiple commits.
#
t_require_commands fragmented_data_extents
# presumably rough counts of extents per btree block and per list block
# -- confirm against the on-disk format
EXTENTS_PER_BTREE_BLOCK=600
EXTENTS_PER_LIST_BLOCK=8192
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
echo "== creating fragmented extents"
# args: <freed_extents> <move_stride> <alloc_file> <trunc_file>
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"
echo "== unlink file with moved extents to free extents per block"
# the moved extents went into the second file, removing it frees them
rm -f "$T_D0/move"
echo "== cleanup"
rm -f "$T_D0/alloc"
t_pass