mirror of
https://github.com/versity/scoutfs.git
synced 2026-01-10 05:37:25 +00:00
Compare commits
7 Commits
zab/rhel8_
...
v1.7
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f13aba78b1 | ||
|
|
3220c2055c | ||
|
|
1cbc927ccb | ||
|
|
acb94dd9b7 | ||
|
|
233fbb39f3 | ||
|
|
198d3cda32 | ||
|
|
e8c64b4217 |
@@ -1,6 +1,29 @@
|
||||
Versity ScoutFS Release Notes
|
||||
=============================
|
||||
|
||||
---
|
||||
v1.7
|
||||
\
|
||||
*Aug 26, 2022*
|
||||
|
||||
* **Fixed possible persistent errors moving freed data extents**
|
||||
\
|
||||
Fixed a case where the server could hit persistent errors trying to
|
||||
move a client's freed extents in one commit. The client had to free
|
||||
a large number of extents that occupied distant positions in the
|
||||
global free extent btree. Very large fragmented files could cause
|
||||
this. The server now moves the freed extents in multiple commits and
|
||||
can always ensure forward progress.
|
||||
|
||||
* **Fixed possible persistent errors from freed duplicate extents**
|
||||
\
|
||||
Background orphan deletion wasn't properly synchronizing with
|
||||
foreground tasks deleting very large files. If a deletion took long
|
||||
enough then background deletion could also attempt to delete inode items
|
||||
while the deletion was making progress. This could create duplicate
|
||||
deletions of data extent items which causes the server to abort when
|
||||
it later discovers the duplicate extents as it merges free lists.
|
||||
|
||||
---
|
||||
v1.6
|
||||
\
|
||||
|
||||
@@ -46,10 +46,6 @@ scoutfs-y += \
|
||||
volopt.o \
|
||||
xattr.o
|
||||
|
||||
ifdef KC_BUILD_KERNELCOMPAT
|
||||
scoutfs-y += kernelcompat.o
|
||||
endif
|
||||
|
||||
#
|
||||
# The raw types aren't available in userspace headers. Make sure all
|
||||
# the types we use in the headers are the exported __ versions.
|
||||
|
||||
@@ -34,52 +34,3 @@ endif
|
||||
ifneq (,$(shell grep 'FMODE_KABI_ITERATE' include/linux/fs.h))
|
||||
ccflags-y += -DKC_FMODE_KABI_ITERATE
|
||||
endif
|
||||
|
||||
#
|
||||
# v4.11-12447-g104b4e5139fe
|
||||
#
|
||||
# Renamed __percpu_counter_add to percpu_counter_add_batch to clarify
|
||||
# that the __ wasn't less safe, just took an extra parameter.
|
||||
#
|
||||
ifneq (,$(shell grep 'percpu_counter_add_batch' include/linux/percpu_counter.h))
|
||||
ccflags-y += -DKC_PERCPU_COUNTER_ADD_BATCH
|
||||
endif
|
||||
|
||||
#
|
||||
# v4.11-4550-g7dea19f9ee63
|
||||
#
|
||||
# Introduced memalloc_nofs_{save,restore} preferred instead of _noio_.
|
||||
#
|
||||
ifneq (,$(shell grep 'memalloc_nofs_save' include/linux/sched/mm.h))
|
||||
ccflags-y += -DKC_MEMALLOC_NOFS_SAVE
|
||||
endif
|
||||
|
||||
#
|
||||
# v4.7-12414-g1eff9d322a44
|
||||
#
|
||||
# Renamed bi_rw to bi_opf to force old code to catch up. We use it as a
|
||||
# single switch between old and new bio structures.
|
||||
#
|
||||
ifneq (,$(shell grep 'bi_opf' include/linux/blk_types.h))
|
||||
ccflags-y += -DKC_BIO_BI_OPF
|
||||
endif
|
||||
|
||||
#
|
||||
# v4.12-rc2-201-g4e4cbee93d56
|
||||
#
|
||||
# Moves to bi_status BLK_STS_ API instead of having a mix of error
|
||||
# end_io args or bi_error.
|
||||
#
|
||||
ifneq (,$(shell grep 'bi_status' include/linux/blk_types.h))
|
||||
ccflags-y += -DKC_BIO_BI_STATUS
|
||||
endif
|
||||
|
||||
#
|
||||
# v3.11-8765-ga0b02131c5fc
|
||||
#
|
||||
# Remove the old ->shrink() API, ->{scan,count}_objects is preferred.
|
||||
#
|
||||
ifneq (,$(shell grep '(*shrink)' include/linux/shrinker.h))
|
||||
ccflags-y += -DKC_SHRINKER_SHRINK
|
||||
KC_BUILD_KERNELCOMPAT=1
|
||||
endif
|
||||
|
||||
@@ -892,12 +892,11 @@ static int find_zone_extent(struct super_block *sb, struct scoutfs_alloc_root *r
|
||||
* -ENOENT is returned if we run out of extents in the source tree
|
||||
* before moving the total.
|
||||
*
|
||||
* If meta_reserved is non-zero then -EINPROGRESS can be returned if the
|
||||
* current meta allocator's avail blocks or room for freed blocks would
|
||||
* have fallen under the reserved amount. The could have been
|
||||
* successfully dirtied in this case but the number of blocks moved is
|
||||
* not returned. The caller is expected to deal with the partial
|
||||
* progress by commiting the dirty trees and examining the resulting
|
||||
* If meta_budget is non-zero then -EINPROGRESS can be returned if the
|
||||
* the caller's budget is consumed in the allocator during this call
|
||||
* (though not necessarily by us, we don't have per-thread tracking of
|
||||
* allocator consumption :/). The call can still have made progress and
|
||||
* caller is expected commit the dirty trees and examining the resulting
|
||||
* modified trees to see if they need to continue moving extents.
|
||||
*
|
||||
* The caller can specify that extents in the source tree should first
|
||||
@@ -914,7 +913,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 total,
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved)
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget)
|
||||
{
|
||||
struct alloc_ext_args args = {
|
||||
.alloc = alloc,
|
||||
@@ -922,6 +921,8 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
};
|
||||
struct scoutfs_extent found;
|
||||
struct scoutfs_extent ext;
|
||||
u32 avail_start = 0;
|
||||
u32 freed_start = 0;
|
||||
u64 moved = 0;
|
||||
u64 count;
|
||||
int ret = 0;
|
||||
@@ -932,6 +933,9 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
vacant = NULL;
|
||||
}
|
||||
|
||||
if (meta_budget != 0)
|
||||
scoutfs_alloc_meta_remaining(alloc, &avail_start, &freed_start);
|
||||
|
||||
while (moved < total) {
|
||||
count = total - moved;
|
||||
|
||||
@@ -964,10 +968,10 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
if (meta_reserved != 0 &&
|
||||
scoutfs_alloc_meta_low(sb, alloc, meta_reserved +
|
||||
extent_mod_blocks(src->root.height) +
|
||||
extent_mod_blocks(dst->root.height))) {
|
||||
if (meta_budget != 0 &&
|
||||
scoutfs_alloc_meta_low_since(alloc, avail_start, freed_start, meta_budget,
|
||||
extent_mod_blocks(src->root.height) +
|
||||
extent_mod_blocks(dst->root.height))) {
|
||||
ret = -EINPROGRESS;
|
||||
break;
|
||||
}
|
||||
@@ -1351,6 +1355,27 @@ void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total,
|
||||
} while (read_seqretry(&alloc->seqlock, seq));
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if the caller's consumption of nr from either avail or
|
||||
* freed would end up exceeding their budget relative to the starting
|
||||
* remaining snapshot they took.
|
||||
*/
|
||||
bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
|
||||
u32 budget, u32 nr)
|
||||
{
|
||||
u32 avail_use;
|
||||
u32 freed_use;
|
||||
u32 avail;
|
||||
u32 freed;
|
||||
|
||||
scoutfs_alloc_meta_remaining(alloc, &avail, &freed);
|
||||
|
||||
avail_use = avail_start - avail;
|
||||
freed_use = freed_start - freed;
|
||||
|
||||
return ((avail_use + nr) > budget) || ((freed_use + nr) > budget);
|
||||
}
|
||||
|
||||
bool scoutfs_alloc_test_flag(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 flag)
|
||||
{
|
||||
|
||||
@@ -131,7 +131,7 @@ int scoutfs_alloc_move(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri,
|
||||
struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 total,
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_reserved);
|
||||
__le64 *exclusive, __le64 *vacant, u64 zone_blocks, u64 meta_budget);
|
||||
int scoutfs_alloc_insert(struct super_block *sb, struct scoutfs_alloc *alloc,
|
||||
struct scoutfs_block_writer *wri, struct scoutfs_alloc_root *root,
|
||||
u64 start, u64 len);
|
||||
@@ -159,6 +159,8 @@ int scoutfs_alloc_splice_list(struct super_block *sb,
|
||||
bool scoutfs_alloc_meta_low(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 nr);
|
||||
void scoutfs_alloc_meta_remaining(struct scoutfs_alloc *alloc, u32 *avail_total, u32 *freed_space);
|
||||
bool scoutfs_alloc_meta_low_since(struct scoutfs_alloc *alloc, u32 avail_start, u32 freed_start,
|
||||
u32 budget, u32 nr);
|
||||
bool scoutfs_alloc_test_flag(struct super_block *sb,
|
||||
struct scoutfs_alloc *alloc, u32 flag);
|
||||
|
||||
|
||||
@@ -21,7 +21,6 @@
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/rhashtable.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
#include "format.h"
|
||||
#include "super.h"
|
||||
@@ -58,7 +57,7 @@ struct block_info {
|
||||
atomic64_t access_counter;
|
||||
struct rhashtable ht;
|
||||
wait_queue_head_t waitq;
|
||||
KC_DEFINE_SHRINKER(shrinker);
|
||||
struct shrinker shrinker;
|
||||
struct work_struct free_work;
|
||||
struct llist_head free_llist;
|
||||
};
|
||||
@@ -129,7 +128,7 @@ static __le32 block_calc_crc(struct scoutfs_block_header *hdr, u32 size)
|
||||
static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
|
||||
{
|
||||
struct block_private *bp;
|
||||
unsigned int nofs_flags;
|
||||
unsigned int noio_flags;
|
||||
|
||||
/*
|
||||
* If we had multiple blocks per page we'd need to be a little
|
||||
@@ -157,9 +156,9 @@ static struct block_private *block_alloc(struct super_block *sb, u64 blkno)
|
||||
* spurious reclaim-on dependencies and warnings.
|
||||
*/
|
||||
lockdep_off();
|
||||
nofs_flags = memalloc_nofs_save();
|
||||
noio_flags = memalloc_noio_save();
|
||||
bp->virt = __vmalloc(SCOUTFS_BLOCK_LG_SIZE, GFP_NOFS | __GFP_HIGHMEM, PAGE_KERNEL);
|
||||
memalloc_nofs_restore(nofs_flags);
|
||||
memalloc_noio_restore(noio_flags);
|
||||
lockdep_on();
|
||||
|
||||
if (!bp->virt) {
|
||||
@@ -437,10 +436,11 @@ static void block_remove_all(struct super_block *sb)
|
||||
* possible. Final freeing, verifying checksums, and unlinking errored
|
||||
* blocks are all done by future users of the blocks.
|
||||
*/
|
||||
static void block_end_io(struct super_block *sb, unsigned int opf,
|
||||
static void block_end_io(struct super_block *sb, int rw,
|
||||
struct block_private *bp, int err)
|
||||
{
|
||||
DECLARE_BLOCK_INFO(sb, binf);
|
||||
bool is_read = !(rw & WRITE);
|
||||
|
||||
if (err) {
|
||||
scoutfs_inc_counter(sb, block_cache_end_io_error);
|
||||
@@ -450,7 +450,7 @@ static void block_end_io(struct super_block *sb, unsigned int opf,
|
||||
if (!atomic_dec_and_test(&bp->io_count))
|
||||
return;
|
||||
|
||||
if (!op_is_write(opf) && !test_bit(BLOCK_BIT_ERROR, &bp->bits))
|
||||
if (is_read && !test_bit(BLOCK_BIT_ERROR, &bp->bits))
|
||||
set_bit(BLOCK_BIT_UPTODATE, &bp->bits);
|
||||
|
||||
clear_bit(BLOCK_BIT_IO_BUSY, &bp->bits);
|
||||
@@ -463,13 +463,13 @@ static void block_end_io(struct super_block *sb, unsigned int opf,
|
||||
wake_up(&binf->waitq);
|
||||
}
|
||||
|
||||
static void KC_DECLARE_BIO_END_IO(block_bio_end_io, struct bio *bio)
|
||||
static void block_bio_end_io(struct bio *bio, int err)
|
||||
{
|
||||
struct block_private *bp = bio->bi_private;
|
||||
struct super_block *sb = bp->sb;
|
||||
|
||||
TRACE_BLOCK(end_io, bp);
|
||||
block_end_io(sb, kc_bio_get_opf(bio), bp, kc_bio_get_errno(bio));
|
||||
block_end_io(sb, bio->bi_rw, bp, err);
|
||||
bio_put(bio);
|
||||
}
|
||||
|
||||
@@ -477,7 +477,7 @@ static void KC_DECLARE_BIO_END_IO(block_bio_end_io, struct bio *bio)
|
||||
* Kick off IO for a single block.
|
||||
*/
|
||||
static int block_submit_bio(struct super_block *sb, struct block_private *bp,
|
||||
unsigned int opf)
|
||||
int rw)
|
||||
{
|
||||
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
|
||||
struct bio *bio = NULL;
|
||||
@@ -510,9 +510,8 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
|
||||
break;
|
||||
}
|
||||
|
||||
kc_bio_set_opf(bio, opf);
|
||||
kc_bio_set_sector(bio, sector + (off >> 9));
|
||||
bio_set_dev(bio, sbi->meta_bdev);
|
||||
bio->bi_sector = sector + (off >> 9);
|
||||
bio->bi_bdev = sbi->meta_bdev;
|
||||
bio->bi_end_io = block_bio_end_io;
|
||||
bio->bi_private = bp;
|
||||
|
||||
@@ -529,18 +528,18 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
|
||||
BUG();
|
||||
|
||||
if (!bio_add_page(bio, page, PAGE_SIZE, 0)) {
|
||||
submit_bio(bio);
|
||||
submit_bio(rw, bio);
|
||||
bio = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (bio)
|
||||
submit_bio(bio);
|
||||
submit_bio(rw, bio);
|
||||
|
||||
blk_finish_plug(&plug);
|
||||
|
||||
/* let racing end_io know we're done */
|
||||
block_end_io(sb, opf, bp, ret);
|
||||
block_end_io(sb, rw, bp, ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -641,7 +640,7 @@ static struct block_private *block_read(struct super_block *sb, u64 blkno)
|
||||
|
||||
if (!test_bit(BLOCK_BIT_UPTODATE, &bp->bits) &&
|
||||
test_and_clear_bit(BLOCK_BIT_NEW, &bp->bits)) {
|
||||
ret = block_submit_bio(sb, bp, REQ_OP_READ);
|
||||
ret = block_submit_bio(sb, bp, READ);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
}
|
||||
@@ -940,7 +939,7 @@ int scoutfs_block_writer_write(struct super_block *sb,
|
||||
/* retry previous write errors */
|
||||
clear_bit(BLOCK_BIT_ERROR, &bp->bits);
|
||||
|
||||
ret = block_submit_bio(sb, bp, REQ_OP_WRITE);
|
||||
ret = block_submit_bio(sb, bp, WRITE);
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
@@ -1040,17 +1039,6 @@ u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb,
|
||||
return wri->nr_dirty_blocks * SCOUTFS_BLOCK_LG_SIZE;
|
||||
}
|
||||
|
||||
static unsigned long block_count_objects(struct shrinker *shrink, struct shrink_control *sc)
|
||||
{
|
||||
struct block_info *binf = container_of(shrink, struct block_info, shrinker);
|
||||
struct super_block *sb = binf->sb;
|
||||
|
||||
scoutfs_inc_counter(sb, block_cache_scan_objects);
|
||||
|
||||
return min_t(u64, (u64)atomic_read(&binf->total_inserted) * SCOUTFS_BLOCK_LG_PAGES_PER,
|
||||
ULONG_MAX / 2); /* magic numbers as we approach ~0UL :/ */
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove a number of cached blocks that haven't been used recently.
|
||||
*
|
||||
@@ -1071,19 +1059,23 @@ static unsigned long block_count_objects(struct shrinker *shrink, struct shrink_
|
||||
* atomically remove blocks when the only references are ours and the
|
||||
* hash table.
|
||||
*/
|
||||
static unsigned long block_scan_objects(struct shrinker *shrink, struct shrink_control *sc)
|
||||
static int block_shrink(struct shrinker *shrink, struct shrink_control *sc)
|
||||
{
|
||||
struct block_info *binf = container_of(shrink, struct block_info, shrinker);
|
||||
struct block_info *binf = container_of(shrink, struct block_info,
|
||||
shrinker);
|
||||
struct super_block *sb = binf->sb;
|
||||
struct rhashtable_iter iter;
|
||||
struct block_private *bp;
|
||||
unsigned long freed = 0;
|
||||
unsigned long nr;
|
||||
u64 recently;
|
||||
|
||||
scoutfs_inc_counter(sb, block_cache_scan_objects);
|
||||
nr = sc->nr_to_scan;
|
||||
if (nr == 0)
|
||||
goto out;
|
||||
|
||||
nr = DIV_ROUND_UP(sc->nr_to_scan, SCOUTFS_BLOCK_LG_PAGES_PER);
|
||||
scoutfs_inc_counter(sb, block_cache_shrink);
|
||||
|
||||
nr = DIV_ROUND_UP(nr, SCOUTFS_BLOCK_LG_PAGES_PER);
|
||||
|
||||
restart:
|
||||
recently = accessed_recently(binf);
|
||||
@@ -1126,7 +1118,6 @@ restart:
|
||||
if (block_remove_solo(sb, bp)) {
|
||||
scoutfs_inc_counter(sb, block_cache_shrink_remove);
|
||||
TRACE_BLOCK(shrink, bp);
|
||||
freed++;
|
||||
nr--;
|
||||
}
|
||||
block_put(sb, bp);
|
||||
@@ -1135,8 +1126,9 @@ restart:
|
||||
|
||||
rhashtable_walk_stop(&iter);
|
||||
rhashtable_walk_exit(&iter);
|
||||
|
||||
return freed;
|
||||
out:
|
||||
return min_t(u64, (u64)atomic_read(&binf->total_inserted) * SCOUTFS_BLOCK_LG_PAGES_PER,
|
||||
INT_MAX);
|
||||
}
|
||||
|
||||
struct sm_block_completion {
|
||||
@@ -1144,11 +1136,11 @@ struct sm_block_completion {
|
||||
int err;
|
||||
};
|
||||
|
||||
static void KC_DECLARE_BIO_END_IO(sm_block_bio_end_io, struct bio *bio)
|
||||
static void sm_block_bio_end_io(struct bio *bio, int err)
|
||||
{
|
||||
struct sm_block_completion *sbc = bio->bi_private;
|
||||
|
||||
sbc->err = kc_bio_get_errno(bio);
|
||||
sbc->err = err;
|
||||
complete(&sbc->comp);
|
||||
bio_put(bio);
|
||||
}
|
||||
@@ -1163,8 +1155,9 @@ static void KC_DECLARE_BIO_END_IO(sm_block_bio_end_io, struct bio *bio)
|
||||
* only layer that sees the full block buffer so we pass the calculated
|
||||
* crc to the caller for them to check in their context.
|
||||
*/
|
||||
static int sm_block_io(struct super_block *sb, struct block_device *bdev, unsigned int opf,
|
||||
u64 blkno, struct scoutfs_block_header *hdr, size_t len, __le32 *blk_crc)
|
||||
static int sm_block_io(struct super_block *sb, struct block_device *bdev, int rw, u64 blkno,
|
||||
struct scoutfs_block_header *hdr, size_t len,
|
||||
__le32 *blk_crc)
|
||||
{
|
||||
struct scoutfs_block_header *pg_hdr;
|
||||
struct sm_block_completion sbc;
|
||||
@@ -1178,7 +1171,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, unsign
|
||||
return -EIO;
|
||||
|
||||
if (WARN_ON_ONCE(len > SCOUTFS_BLOCK_SM_SIZE) ||
|
||||
WARN_ON_ONCE(!op_is_write(opf) && !blk_crc))
|
||||
WARN_ON_ONCE(!(rw & WRITE) && !blk_crc))
|
||||
return -EINVAL;
|
||||
|
||||
page = alloc_page(GFP_NOFS);
|
||||
@@ -1187,7 +1180,7 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, unsign
|
||||
|
||||
pg_hdr = page_address(page);
|
||||
|
||||
if (op_is_write(opf)) {
|
||||
if (rw & WRITE) {
|
||||
memcpy(pg_hdr, hdr, len);
|
||||
if (len < SCOUTFS_BLOCK_SM_SIZE)
|
||||
memset((char *)pg_hdr + len, 0,
|
||||
@@ -1201,9 +1194,8 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, unsign
|
||||
goto out;
|
||||
}
|
||||
|
||||
bio->bi_opf = opf | REQ_SYNC;
|
||||
kc_bio_set_sector(bio, blkno << (SCOUTFS_BLOCK_SM_SHIFT - 9));
|
||||
bio_set_dev(bio, bdev);
|
||||
bio->bi_sector = blkno << (SCOUTFS_BLOCK_SM_SHIFT - 9);
|
||||
bio->bi_bdev = bdev;
|
||||
bio->bi_end_io = sm_block_bio_end_io;
|
||||
bio->bi_private = &sbc;
|
||||
bio_add_page(bio, page, SCOUTFS_BLOCK_SM_SIZE, 0);
|
||||
@@ -1211,12 +1203,12 @@ static int sm_block_io(struct super_block *sb, struct block_device *bdev, unsign
|
||||
init_completion(&sbc.comp);
|
||||
sbc.err = 0;
|
||||
|
||||
submit_bio(bio);
|
||||
submit_bio((rw & WRITE) ? WRITE_SYNC : READ_SYNC, bio);
|
||||
|
||||
wait_for_completion(&sbc.comp);
|
||||
ret = sbc.err;
|
||||
|
||||
if (ret == 0 && !op_is_write(opf)) {
|
||||
if (ret == 0 && !(rw & WRITE)) {
|
||||
memcpy(hdr, pg_hdr, len);
|
||||
*blk_crc = block_calc_crc(pg_hdr, SCOUTFS_BLOCK_SM_SIZE);
|
||||
}
|
||||
@@ -1230,14 +1222,14 @@ int scoutfs_block_read_sm(struct super_block *sb,
|
||||
struct scoutfs_block_header *hdr, size_t len,
|
||||
__le32 *blk_crc)
|
||||
{
|
||||
return sm_block_io(sb, bdev, REQ_OP_READ, blkno, hdr, len, blk_crc);
|
||||
return sm_block_io(sb, bdev, READ, blkno, hdr, len, blk_crc);
|
||||
}
|
||||
|
||||
int scoutfs_block_write_sm(struct super_block *sb,
|
||||
struct block_device *bdev, u64 blkno,
|
||||
struct scoutfs_block_header *hdr, size_t len)
|
||||
{
|
||||
return sm_block_io(sb, bdev, REQ_OP_WRITE, blkno, hdr, len, NULL);
|
||||
return sm_block_io(sb, bdev, WRITE, blkno, hdr, len, NULL);
|
||||
}
|
||||
|
||||
int scoutfs_block_setup(struct super_block *sb)
|
||||
@@ -1262,8 +1254,7 @@ int scoutfs_block_setup(struct super_block *sb)
|
||||
atomic_set(&binf->total_inserted, 0);
|
||||
atomic64_set(&binf->access_counter, 0);
|
||||
init_waitqueue_head(&binf->waitq);
|
||||
KC_INIT_SHRINKER_FUNCS(struct block_info, shrinker,
|
||||
&binf->shrinker, block_count_objects, block_scan_objects);
|
||||
binf->shrinker.shrink = block_shrink;
|
||||
binf->shrinker.seeks = DEFAULT_SEEKS;
|
||||
register_shrinker(&binf->shrinker);
|
||||
INIT_WORK(&binf->free_work, block_free_work);
|
||||
|
||||
@@ -30,8 +30,6 @@
|
||||
EXPAND_COUNTER(block_cache_free) \
|
||||
EXPAND_COUNTER(block_cache_free_work) \
|
||||
EXPAND_COUNTER(block_cache_remove_stale) \
|
||||
EXPAND_COUNTER(block_cache_count_objects) \
|
||||
EXPAND_COUNTER(block_cache_scan_objects) \
|
||||
EXPAND_COUNTER(block_cache_shrink) \
|
||||
EXPAND_COUNTER(block_cache_shrink_next) \
|
||||
EXPAND_COUNTER(block_cache_shrink_recent) \
|
||||
@@ -237,12 +235,12 @@ struct scoutfs_counters {
|
||||
#define SCOUTFS_PCPU_COUNTER_BATCH (1 << 30)
|
||||
|
||||
#define scoutfs_inc_counter(sb, which) \
|
||||
percpu_counter_add_batch(&SCOUTFS_SB(sb)->counters->which, 1, \
|
||||
SCOUTFS_PCPU_COUNTER_BATCH)
|
||||
__percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, 1, \
|
||||
SCOUTFS_PCPU_COUNTER_BATCH)
|
||||
|
||||
#define scoutfs_add_counter(sb, which, cnt) \
|
||||
percpu_counter_add_batch(&SCOUTFS_SB(sb)->counters->which, cnt, \
|
||||
SCOUTFS_PCPU_COUNTER_BATCH)
|
||||
__percpu_counter_add(&SCOUTFS_SB(sb)->counters->which, cnt, \
|
||||
SCOUTFS_PCPU_COUNTER_BATCH)
|
||||
|
||||
void __init scoutfs_init_counters(void);
|
||||
int scoutfs_setup_counters(struct super_block *sb);
|
||||
|
||||
@@ -1685,6 +1685,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
|
||||
struct scoutfs_lock *lock = NULL;
|
||||
struct scoutfs_inode sinode;
|
||||
struct scoutfs_key key;
|
||||
bool clear_trying = false;
|
||||
u64 group_nr;
|
||||
int bit_nr;
|
||||
int ret;
|
||||
@@ -1704,6 +1705,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
|
||||
ret = 0;
|
||||
goto out;
|
||||
}
|
||||
clear_trying = true;
|
||||
|
||||
/* can't delete if it's cached in local or remote mounts */
|
||||
if (scoutfs_omap_test(sb, ino) || test_bit_le(bit_nr, ldata->map.bits)) {
|
||||
@@ -1730,7 +1732,7 @@ static int try_delete_inode_items(struct super_block *sb, u64 ino)
|
||||
|
||||
ret = delete_inode_items(sb, ino, &sinode, lock, orph_lock);
|
||||
out:
|
||||
if (ldata)
|
||||
if (clear_trying)
|
||||
clear_bit(bit_nr, ldata->trying);
|
||||
|
||||
scoutfs_unlock(sb, lock, SCOUTFS_LOCK_WRITE);
|
||||
|
||||
@@ -1,23 +0,0 @@
|
||||
|
||||
#include "kernelcompat.h"
|
||||
|
||||
#ifdef KC_SHRINKER_SHRINK
|
||||
#include <linux/shrinker.h>
|
||||
/*
|
||||
* If a target doesn't have that .{count,scan}_objects() interface then
|
||||
* we have a .shrink() helper that performs the shrink work in terms of
|
||||
* count/scan.
|
||||
*/
|
||||
int kc_shrink_wrapper(struct shrinker *shrink, struct shrink_control *sc)
|
||||
{
|
||||
struct kc_shrinker_funcs *funcs = KC_SHRINKER_FUNCS(shrink);
|
||||
unsigned long nr;
|
||||
|
||||
if (sc->nr_to_scan != 0)
|
||||
funcs->scan_objects(shrink, sc);
|
||||
|
||||
nr = funcs->count_objects(shrink, sc);
|
||||
|
||||
return min_t(unsigned long, nr, INT_MAX);
|
||||
}
|
||||
#endif
|
||||
@@ -46,81 +46,4 @@ static inline int dir_emit_dots(struct file *file, void *dirent,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef KC_DIR_EMIT_DOTS
|
||||
#define percpu_counter_add_batch __percpu_counter_add
|
||||
#endif
|
||||
|
||||
#ifndef KC_MEMALLOC_NOFS_SAVE
|
||||
#define memalloc_nofs_save memalloc_noio_save
|
||||
#define memalloc_nofs_restore memalloc_noio_restore
|
||||
#endif
|
||||
|
||||
#ifdef KC_BIO_BI_OPF
|
||||
#define kc_bio_get_opf(bio) \
|
||||
({ \
|
||||
(bio)->bi_opf; \
|
||||
})
|
||||
#define kc_bio_set_opf(bio, opf) \
|
||||
do { \
|
||||
(bio)->bi_opf = opf; \
|
||||
} while (0)
|
||||
#define kc_bio_set_sector(bio, sect) \
|
||||
do { \
|
||||
(bio)->bi_iter.bi_sector = sect;\
|
||||
} while (0)
|
||||
#else
|
||||
#define kc_bio_get_opf(bio) \
|
||||
({ \
|
||||
(bio)->bi_rw; \
|
||||
})
|
||||
#define kc_bio_set_opf(bio, opf) \
|
||||
do { \
|
||||
(bio)->bio_rw = opf; \
|
||||
} while (0)
|
||||
#define kc_bio_set_sector(bio, sect) \
|
||||
do { \
|
||||
(bio)->bi_sector = sect; \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#ifdef KC_BIO_BI_STATUS
|
||||
#define KC_DECLARE_BIO_END_IO(name, bio) name(bio)
|
||||
#define kc_bio_get_errno(bio) ({ blk_status_to_errno((bio)->bi_status); })
|
||||
#else
|
||||
#define KC_DECLARE_BIO_END_IO(name, bio) name(bio, int _error_arg)
|
||||
#define kc_bio_get_errno(bio) ({ (int)((void)(bio), _error_arg); })
|
||||
#endif
|
||||
|
||||
#ifndef KC_SHRINKER_SHRINK
|
||||
#define KC_DEFINE_SHRINKER(name) struct shrinker name
|
||||
#define KC_INIT_SHRINKER_FUNCS(type, name, shrink, count, scan) do { \
|
||||
__typeof__(shrink) _shrink = (shrink); \
|
||||
_shrink->count_objects = count; \
|
||||
_shrink->scan_objects = scan; \
|
||||
} while (0)
|
||||
#else
|
||||
#include <linux/shrinker.h>
|
||||
struct kc_shrinker_funcs {
|
||||
unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc);
|
||||
unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc);
|
||||
};
|
||||
/* using adjacent member of an unnamed struct */
|
||||
#define KC_DEFINE_SHRINKER(name) \
|
||||
{ \
|
||||
struct kc_shrinker_funcs shrinker_funcs; \
|
||||
struct shinker name; \
|
||||
}
|
||||
#define KC_SHRINKER_FUNCS(shrinker) \
|
||||
((void *)((long)(shrink) - sizeof(struct kc_shrinker_funcs)))
|
||||
#define KC_INIT_SHRINKER_FUNCS(type, name, shrink, count, scan) do { \
|
||||
BUILD_BUG_ON(offsetof(cont, shrink_funcs) + sizeof(struct kc_shrinker_funcs)) != \
|
||||
offsetof(cont, name) + sizeof(struct kc_shrinker_funcs); \
|
||||
struct kc_shrinker_funcs *_funcs = KC_SHRINKER_FUNCS(shrink) \
|
||||
__typeof__(shrink) _shrink = (shrink); \
|
||||
_funcs->count_objects = count; \
|
||||
_funcs->scan_objects = scan; \
|
||||
_shrink->shrink = kc_shrink_wrapper; \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -694,13 +694,13 @@ static int alloc_move_refill_zoned(struct super_block *sb, struct scoutfs_alloc_
|
||||
|
||||
static int alloc_move_empty(struct super_block *sb,
|
||||
struct scoutfs_alloc_root *dst,
|
||||
struct scoutfs_alloc_root *src, u64 meta_reserved)
|
||||
struct scoutfs_alloc_root *src, u64 meta_budget)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
|
||||
return scoutfs_alloc_move(sb, &server->alloc, &server->wri,
|
||||
dst, src, le64_to_cpu(src->total_len), NULL, NULL, 0,
|
||||
meta_reserved);
|
||||
meta_budget);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1226,6 +1226,82 @@ static int finalize_and_start_log_merge(struct super_block *sb, struct scoutfs_l
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* The calling get_log_trees ran out of available blocks in its commit's
|
||||
* metadata allocator while moving extents from the log tree's
|
||||
* data_freed into the core data_avail. This finishes moving the
|
||||
* extents in as many additional commits as it takes. The logs mutex
|
||||
* is nested inside holding commits so we recheck the persistent item
|
||||
* each time we commit to make sure it's still what we think. The
|
||||
* caller is still going to send the item to the client so we update the
|
||||
* caller's each time we make progress. This is a best-effort attempt
|
||||
* to clean up and it's valid to leave extents in data_freed we don't
|
||||
* return errors to the caller. The client will continue the work later
|
||||
* in get_log_trees or as the rid is reclaimed.
|
||||
*/
|
||||
static void try_drain_data_freed(struct super_block *sb, struct scoutfs_log_trees *lt)
|
||||
{
|
||||
DECLARE_SERVER_INFO(sb, server);
|
||||
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
|
||||
const u64 rid = le64_to_cpu(lt->rid);
|
||||
const u64 nr = le64_to_cpu(lt->nr);
|
||||
struct scoutfs_log_trees drain;
|
||||
struct scoutfs_key key;
|
||||
COMMIT_HOLD(hold);
|
||||
int ret = 0;
|
||||
int err;
|
||||
|
||||
scoutfs_key_init_log_trees(&key, rid, nr);
|
||||
|
||||
while (lt->data_freed.total_len != 0) {
|
||||
server_hold_commit(sb, &hold);
|
||||
mutex_lock(&server->logs_mutex);
|
||||
|
||||
ret = find_log_trees_item(sb, &super->logs_root, false, rid, U64_MAX, &drain);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
/* careful to only keep draining the caller's specific open trans */
|
||||
if (drain.nr != lt->nr || drain.get_trans_seq != lt->get_trans_seq ||
|
||||
drain.commit_trans_seq != lt->commit_trans_seq || drain.flags != lt->flags) {
|
||||
ret = -ENOENT;
|
||||
break;
|
||||
}
|
||||
|
||||
ret = scoutfs_btree_dirty(sb, &server->alloc, &server->wri,
|
||||
&super->logs_root, &key);
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
/* moving can modify and return errors, always update caller and item */
|
||||
mutex_lock(&server->alloc_mutex);
|
||||
ret = alloc_move_empty(sb, &super->data_alloc, &drain.data_freed,
|
||||
COMMIT_HOLD_ALLOC_BUDGET / 2);
|
||||
mutex_unlock(&server->alloc_mutex);
|
||||
if (ret == -EINPROGRESS)
|
||||
ret = 0;
|
||||
|
||||
*lt = drain;
|
||||
err = scoutfs_btree_force(sb, &server->alloc, &server->wri,
|
||||
&super->logs_root, &key, &drain, sizeof(drain));
|
||||
BUG_ON(err < 0); /* dirtying must guarantee success */
|
||||
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
|
||||
ret = server_apply_commit(sb, &hold, ret);
|
||||
if (ret < 0) {
|
||||
ret = 0; /* don't try to abort, ignoring ret */
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* try to cleanly abort and write any partial dirty btree blocks, but ignore result */
|
||||
if (ret < 0) {
|
||||
mutex_unlock(&server->logs_mutex);
|
||||
server_apply_commit(sb, &hold, 0);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Give the client roots to all the trees that they'll use to build
|
||||
* their transaction.
|
||||
@@ -1351,7 +1427,9 @@ static int server_get_log_trees(struct super_block *sb,
|
||||
goto update;
|
||||
}
|
||||
|
||||
ret = alloc_move_empty(sb, &super->data_alloc, <.data_freed, 0);
|
||||
ret = alloc_move_empty(sb, &super->data_alloc, <.data_freed, 100);
|
||||
if (ret == -EINPROGRESS)
|
||||
ret = 0;
|
||||
if (ret < 0) {
|
||||
err_str = "emptying committed data_freed";
|
||||
goto update;
|
||||
@@ -1429,6 +1507,10 @@ out:
|
||||
scoutfs_err(sb, "error %d getting log trees for rid %016llx: %s",
|
||||
ret, rid, err_str);
|
||||
|
||||
/* try to drain excessive data_freed with additional commits, if needed, ignoring err */
|
||||
if (ret == 0)
|
||||
try_drain_data_freed(sb, <);
|
||||
|
||||
return scoutfs_net_response(sb, conn, cmd, id, ret, <, sizeof(lt));
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,8 @@ BIN := src/createmany \
|
||||
src/bulk_create_paths \
|
||||
src/stage_tmpfile \
|
||||
src/find_xattrs \
|
||||
src/create_xattr_loop
|
||||
src/create_xattr_loop \
|
||||
src/fragmented_data_extents
|
||||
|
||||
DEPS := $(wildcard src/*.d)
|
||||
|
||||
|
||||
3
tests/golden/large-fragmented-free
Normal file
3
tests/golden/large-fragmented-free
Normal file
@@ -0,0 +1,3 @@
|
||||
== creating fragmented extents
|
||||
== unlink file with moved extents to free extents per block
|
||||
== cleanup
|
||||
@@ -9,6 +9,7 @@ fallocate.sh
|
||||
setattr_more.sh
|
||||
offline-extent-waiting.sh
|
||||
move-blocks.sh
|
||||
large-fragmented-free.sh
|
||||
enospc.sh
|
||||
srch-basic-functionality.sh
|
||||
simple-xattr-unit.sh
|
||||
|
||||
113
tests/src/fragmented_data_extents.c
Normal file
113
tests/src/fragmented_data_extents.c
Normal file
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Copyright (C) 2021 Versity Software, Inc. All rights reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU General Public
|
||||
* License v2 as published by the Free Software Foundation.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* General Public License for more details.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This creates fragmented data extents.
|
||||
*
|
||||
* A file is created that has alternating free and allocated extents.
|
||||
* This also results in the global allocator having the matching
|
||||
* fragmented free extent pattern. While that file is being created,
|
||||
* occasionally an allocated extent is moved to another file. This
|
||||
* results in a file that has fragmented extents at a given stride that
|
||||
* can be deleted to create free data extents with a given stride.
|
||||
*
|
||||
* We don't have hole punching so to do this quickly we use a goofy
|
||||
* combination of fallocate, truncate, and our move_blocks ioctl.
|
||||
*/
|
||||
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include <unistd.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
#include <linux/types.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "ioctl.h"
|
||||
|
||||
#define BLOCK_SIZE 4096
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
struct scoutfs_ioctl_move_blocks mb = {0,};
|
||||
unsigned long long freed_extents;
|
||||
unsigned long long move_stride;
|
||||
unsigned long long i;
|
||||
int alloc_fd;
|
||||
int trunc_fd;
|
||||
off_t off;
|
||||
int ret;
|
||||
|
||||
if (argc != 5) {
|
||||
printf("%s <freed_extents> <move_stride> <alloc_file> <trunc_file>\n", argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
freed_extents = strtoull(argv[1], NULL, 0);
|
||||
move_stride = strtoull(argv[2], NULL, 0);
|
||||
|
||||
alloc_fd = open(argv[3], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
|
||||
if (alloc_fd == -1) {
|
||||
fprintf(stderr, "error opening %s: %d (%s)\n", argv[3], errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
trunc_fd = open(argv[4], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);
|
||||
if (trunc_fd == -1) {
|
||||
fprintf(stderr, "error opening %s: %d (%s)\n", argv[4], errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
for (i = 0, off = 0; i < freed_extents; i++, off += BLOCK_SIZE * 2) {
|
||||
|
||||
ret = fallocate(alloc_fd, 0, off, BLOCK_SIZE * 2);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "fallocate at off %llu error: %d (%s)\n",
|
||||
(unsigned long long)off, errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
ret = ftruncate(alloc_fd, off + BLOCK_SIZE);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "truncate to off %llu error: %d (%s)\n",
|
||||
(unsigned long long)off + BLOCK_SIZE, errno, strerror(errno));
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if ((i % move_stride) == 0) {
|
||||
mb.from_fd = alloc_fd;
|
||||
mb.from_off = off;
|
||||
mb.len = BLOCK_SIZE;
|
||||
mb.to_off = i * BLOCK_SIZE;
|
||||
|
||||
ret = ioctl(trunc_fd, SCOUTFS_IOC_MOVE_BLOCKS, &mb);
|
||||
if (ret < 0) {
|
||||
fprintf(stderr, "move from off %llu error: %d (%s)\n",
|
||||
(unsigned long long)off,
|
||||
errno, strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (alloc_fd > -1)
|
||||
close(alloc_fd);
|
||||
if (trunc_fd > -1)
|
||||
close(trunc_fd);
|
||||
|
||||
return 0;
|
||||
}
|
||||
22
tests/tests/large-fragmented-free.sh
Normal file
22
tests/tests/large-fragmented-free.sh
Normal file
@@ -0,0 +1,22 @@
|
||||
#
|
||||
# Make sure the server can handle a transaction with a data_freed whose
|
||||
# blocks all hit different btree blocks in the main free list. It
|
||||
# probably has to be merged in multiple commits.
|
||||
#
|
||||
|
||||
t_require_commands fragmented_data_extents
|
||||
|
||||
EXTENTS_PER_BTREE_BLOCK=600
|
||||
EXTENTS_PER_LIST_BLOCK=8192
|
||||
FREED_EXTENTS=$((EXTENTS_PER_BTREE_BLOCK * EXTENTS_PER_LIST_BLOCK))
|
||||
|
||||
echo "== creating fragmented extents"
|
||||
fragmented_data_extents $FREED_EXTENTS $EXTENTS_PER_BTREE_BLOCK "$T_D0/alloc" "$T_D0/move"
|
||||
|
||||
echo "== unlink file with moved extents to free extents per block"
|
||||
rm -f "$T_D0/move"
|
||||
|
||||
echo "== cleanup"
|
||||
rm -f "$T_D0/alloc"
|
||||
|
||||
t_pass
|
||||
Reference in New Issue
Block a user