diff --git a/kmod/src/alloc.h b/kmod/src/alloc.h
index 4496802c..70d39c5e 100644
--- a/kmod/src/alloc.h
+++ b/kmod/src/alloc.h
@@ -19,14 +19,11 @@
 	(128ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
 
 /*
- * The largest aligned region that we'll try to allocate at the end of
- * the file as it's extended. This is also limited to the current file
- * size so we can only waste at most twice the total file size when
- * files are less than this. We try to keep this around the point of
- * diminishing returns in streaming performance of common data devices
- * to limit waste.
+ * The default size that we'll try to preallocate. This tries to hit
+ * the limit of large efficient device writes while minimizing
+ * preallocation that is never used.
  */
-#define SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT \
+#define SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS \
 	(8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SM_SHIFT)
 
 /*
diff --git a/kmod/src/data.c b/kmod/src/data.c
index 2c0822db..402e50b6 100644
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
@@ -366,27 +366,27 @@ static inline u64 ext_last(struct scoutfs_extent *ext)
 
 /*
  * The caller is writing to a logical iblock that doesn't have an
- * allocated extent.
+ * allocated extent. The caller has searched for an extent containing
+ * iblock. If it already existed then it must be unallocated and
+ * offline.
  *
- * We always allocate an extent starting at the logical iblock. The
- * caller has searched for an extent containing iblock. If it already
- * existed then it must be unallocated and offline.
+ * We implement two preallocation strategies. Typically we only
+ * preallocate for simple streaming writes and limit preallocation while
+ * the file is small. The largest efficient allocation size is
+ * typically large enough that it would be unreasonable to allocate that
+ * much for all small files.
  *
- * Preallocation is used if we're strictly contiguously extending
- * writes. That is, if the logical block offset equals the number of
- * online blocks. We try to preallocate the number of blocks existing
- * so that small files don't waste inordinate amounts of space and large
- * files will eventually see large extents. This only works for
- * contiguous single stream writes or stages of files from the first
- * block. It doesn't work for concurrent stages, releasing behind
- * staging, sparse files, multi-node writes, etc. fallocate() is always
- * a better tool to use.
+ * Optionally, we can simply preallocate large empty aligned regions.
+ * This can waste a lot of space for small or sparse files but is
+ * reasonable when the file population is known to be large and dense
+ * but written with non-streaming write patterns.
  */
 static int alloc_block(struct super_block *sb, struct inode *inode,
 		       struct scoutfs_extent *ext, u64 iblock,
 		       struct scoutfs_lock *lock)
 {
 	DECLARE_DATA_INFO(sb, datinf);
+	struct scoutfs_mount_options opts;
 	const u64 ino = scoutfs_ino(inode);
 	struct data_ext_args args = {
 		.ino = ino,
@@ -394,17 +394,22 @@ static int alloc_block(struct super_block *sb, struct inode *inode,
 		.lock = lock,
 	};
 	struct scoutfs_extent found;
-	struct scoutfs_extent pre;
+	struct scoutfs_extent pre = {0,};
+	bool undo_pre = false;
 	u64 blkno = 0;
 	u64 online;
 	u64 offline;
 	u8 flags;
+	u64 start;
 	u64 count;
+	u64 rem;
 	int ret;
 	int err;
 
 	trace_scoutfs_data_alloc_block_enter(sb, ino, iblock, ext);
 
+	scoutfs_options_read(sb, &opts);
+
 	/* can only allocate over existing unallocated offline extent */
 	if (WARN_ON_ONCE(ext->len &&
 			 !(iblock >= ext->start && iblock <= ext_last(ext) &&
@@ -413,66 +418,106 @@ static int alloc_block(struct super_block *sb, struct inode *inode,
 
 	mutex_lock(&datinf->mutex);
 
-	scoutfs_inode_get_onoff(inode, &online, &offline);
+	/* default to single allocation at the written block */
+	start = iblock;
+	count = 1;
+	/* copy existing flags for preallocated regions */
+	flags = ext->len ? ext->flags : 0;
 
 	if (ext->len) {
-		/* limit preallocation to remaining existing (offline) extent */
+		/*
+		 * Assume that offline writers are going to be writing
+		 * all the offline extents and try to preallocate the
+		 * rest of the unwritten extent.
+		 */
 		count = ext->len - (iblock - ext->start);
-		flags = ext->flags;
+
+	} else if (opts.data_prealloc_contig_only) {
+		/*
+		 * Only preallocate when a quick test of the online
+		 * block count indicates a simple streaming write.
+		 * Try to preallocate up to the next extent but limit
+		 * the preallocation size to the number of online
+		 * blocks.
+		 */
+		scoutfs_inode_get_onoff(inode, &online, &offline);
+		if (iblock > 1 && iblock == online) {
+			ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
+					       iblock, 1, &found);
+			if (ret < 0 && ret != -ENOENT)
+				goto out;
+			if (found.len && found.start > iblock)
+				count = found.start - iblock;
+			else
+				count = opts.data_prealloc_blocks;
+
+			count = min(iblock, count);
+		}
 	} else {
-		/* otherwise alloc to next extent */
-		ret = scoutfs_ext_next(sb, &data_ext_ops, &args,
-				       iblock, 1, &found);
+		/*
+		 * Aligned region preallocation only happens if the
+		 * aligned region contains no extents at all. This
+		 * could be fooled by offline sparse extents but we
+		 * don't want to iterate over all offline extents in
+		 * the aligned region.
+		 */
+		div64_u64_rem(iblock, opts.data_prealloc_blocks, &rem);
+		start = iblock - rem;
+		count = opts.data_prealloc_blocks;
+		ret = scoutfs_ext_next(sb, &data_ext_ops, &args, start, 1, &found);
 		if (ret < 0 && ret != -ENOENT)
 			goto out;
-		if (found.len && found.start > iblock)
-			count = found.start - iblock;
-		else
-			count = SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT;
-		flags = 0;
+		if (found.len && found.start < start + count)
+			count = 1;
 	}
 
 	/* overall prealloc limit */
-	count = min_t(u64, count, SCOUTFS_DATA_EXTEND_PREALLOC_LIMIT);
-
-	/* only strictly contiguous extending writes will try to preallocate */
-	if (iblock > 1 && iblock == online)
-		count = min(iblock, count);
-	else
-		count = 1;
+	count = min_t(u64, count, opts.data_prealloc_blocks);
 
 	ret = scoutfs_alloc_data(sb, datinf->alloc, datinf->wri,
 				 &datinf->dalloc, count, &blkno, &count);
 	if (ret < 0)
 		goto out;
 
-	ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, 1, blkno, 0);
-	if (ret < 0)
-		goto out;
+	/*
+	 * An aligned prealloc attempt that gets a smaller extent can
+	 * fail to cover iblock; make sure that it does. This is a
+	 * pathological case so we don't try to move the window past
+	 * iblock, only just far enough to cover it, which we know is
+	 * safe.
+	 */
+	if (start + count <= iblock)
+		start += (iblock - (start + count) + 1);
 
 	if (count > 1) {
-		pre.start = iblock + 1;
-		pre.len = count - 1;
-		pre.map = blkno + 1;
+		pre.start = start;
+		pre.len = count;
+		pre.map = blkno;
 		pre.flags = flags | SEF_UNWRITTEN;
 		ret = scoutfs_ext_set(sb, &data_ext_ops, &args,
 				      pre.start, pre.len, pre.map,
 				      pre.flags);
-		if (ret < 0) {
-			err = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock,
-					      1, 0, flags);
-			BUG_ON(err); /* couldn't restore original */
+		if (ret < 0)
 			goto out;
-		}
+		undo_pre = true;
 	}
 
+	ret = scoutfs_ext_set(sb, &data_ext_ops, &args, iblock, 1, blkno + (iblock - start), 0);
+	if (ret < 0)
+		goto out;
+
+	/* tell the caller we have a single block; could check next? */
 	ext->start = iblock;
 	ext->len = 1;
-	ext->map = blkno;
+	ext->map = blkno + (iblock - start);
 	ext->flags = 0;
 	ret = 0;
 out:
 	if (ret < 0 && blkno > 0) {
+		if (undo_pre) {
+			err = scoutfs_ext_set(sb, &data_ext_ops, &args,
+					      pre.start, pre.len, 0, flags);
+			BUG_ON(err); /* leaked preallocated extent */
+		}
 		err = scoutfs_free_data(sb, datinf->alloc, datinf->wri,
 					&datinf->data_freed,
 					blkno, count);
 		BUG_ON(err); /* leaked free blocks */
diff --git a/kmod/src/options.c b/kmod/src/options.c
index 474d2b0d..e28bf2a6 100644
--- a/kmod/src/options.c
+++ b/kmod/src/options.c
@@ -27,9 +27,12 @@
 #include "options.h"
 #include "super.h"
 #include "inode.h"
+#include "alloc.h"
 
 enum {
 	Opt_acl,
+	Opt_data_prealloc_blocks,
+	Opt_data_prealloc_contig_only,
 	Opt_metadev_path,
 	Opt_noacl,
 	Opt_orphan_scan_delay_ms,
@@ -39,6 +42,8 @@ enum {
 
 static const match_table_t tokens = {
 	{Opt_acl, "acl"},
+	{Opt_data_prealloc_blocks, "data_prealloc_blocks=%s"},
+	{Opt_data_prealloc_contig_only, "data_prealloc_contig_only=%s"},
 	{Opt_metadev_path, "metadev_path=%s"},
 	{Opt_noacl, "noacl"},
 	{Opt_orphan_scan_delay_ms, "orphan_scan_delay_ms=%s"},
@@ -110,9 +115,15 @@ static void free_options(struct scoutfs_mount_options *opts)
 #define DEFAULT_ORPHAN_SCAN_DELAY_MS (10 * MSEC_PER_SEC)
 #define MAX_ORPHAN_SCAN_DELAY_MS (60 * MSEC_PER_SEC)
 
+#define MIN_DATA_PREALLOC_BLOCKS 1ULL
+#define MAX_DATA_PREALLOC_BLOCKS ((unsigned long long)SCOUTFS_BLOCK_SM_MAX)
+
 static void init_default_options(struct scoutfs_mount_options *opts)
 {
 	memset(opts, 0, sizeof(*opts));
+
+	opts->data_prealloc_blocks = SCOUTFS_DATA_PREALLOC_DEFAULT_BLOCKS;
+	opts->data_prealloc_contig_only = 1;
 	opts->quorum_slot_nr = -1;
 	opts->orphan_scan_delay_ms = -1;
 }
@@ -126,6 +137,7 @@ static void init_default_options(struct scoutfs_mount_options *opts)
 static int parse_options(struct super_block *sb, char *options, struct scoutfs_mount_options *opts)
 {
 	substring_t args[MAX_OPT_ARGS];
+	u64 nr64;
 	int nr;
 	int token;
 	char *p;
@@ -142,6 +154,30 @@ static int parse_options(struct super_block *sb, char *options, struct scoutfs_m
 			sb->s_flags |= MS_POSIXACL;
 			break;
 
+		case Opt_data_prealloc_blocks:
+			ret = match_u64(args, &nr64);
+			if (ret < 0 ||
+			    nr64 < MIN_DATA_PREALLOC_BLOCKS || nr64 > MAX_DATA_PREALLOC_BLOCKS) {
+				scoutfs_err(sb, "invalid data_prealloc_blocks option, must be between %llu and %llu",
+					    MIN_DATA_PREALLOC_BLOCKS, MAX_DATA_PREALLOC_BLOCKS);
+				if (ret == 0)
+					ret = -EINVAL;
+				return ret;
+			}
+			opts->data_prealloc_blocks = nr64;
+			break;
+
+		case Opt_data_prealloc_contig_only:
+			ret = match_int(args, &nr);
+			if (ret < 0 || nr < 0 || nr > 1) {
+				scoutfs_err(sb, "invalid data_prealloc_contig_only option, bool must only be 0 or 1");
+				if (ret == 0)
+					ret = -EINVAL;
+				return ret;
+			}
+			opts->data_prealloc_contig_only = nr;
+			break;
+
 		case Opt_metadev_path:
 			ret = parse_bdev_path(sb, &args[0], &opts->metadev_path);
 			if (ret < 0)
@@ -271,6 +307,8 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
 
 	if (is_acl)
 		seq_puts(seq, ",acl");
+	seq_printf(seq, ",data_prealloc_blocks=%llu", opts.data_prealloc_blocks);
+	seq_printf(seq, ",data_prealloc_contig_only=%u", opts.data_prealloc_contig_only);
 	seq_printf(seq, ",metadev_path=%s", opts.metadev_path);
 	if (!is_acl)
 		seq_puts(seq, ",noacl");
@@ -281,6 +319,83 @@ int scoutfs_options_show(struct seq_file *seq, struct dentry *root)
 	return 0;
 }
 
+static ssize_t data_prealloc_blocks_show(struct kobject *kobj, struct kobj_attribute *attr,
+					 char *buf)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	struct scoutfs_mount_options opts;
+
+	scoutfs_options_read(sb, &opts);
+
+	return snprintf(buf, PAGE_SIZE, "%llu", opts.data_prealloc_blocks);
+}
+static ssize_t data_prealloc_blocks_store(struct kobject *kobj, struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	DECLARE_OPTIONS_INFO(sb, optinf);
+	char nullterm[30]; /* more than enough for octal -U64_MAX */
+	u64 val;
+	int len;
+	int ret;
+
+	len = min(count, sizeof(nullterm) - 1);
+	memcpy(nullterm, buf, len);
+	nullterm[len] = '\0';
+
+	ret = kstrtoull(nullterm, 0, &val);
+	if (ret < 0 || val < MIN_DATA_PREALLOC_BLOCKS || val > MAX_DATA_PREALLOC_BLOCKS) {
+		scoutfs_err(sb, "invalid data_prealloc_blocks option, must be between %llu and %llu",
+			    MIN_DATA_PREALLOC_BLOCKS, MAX_DATA_PREALLOC_BLOCKS);
+		return -EINVAL;
+	}
+
+	write_seqlock(&optinf->seqlock);
+	optinf->opts.data_prealloc_blocks = val;
+	write_sequnlock(&optinf->seqlock);
+
+	return count;
+}
+SCOUTFS_ATTR_RW(data_prealloc_blocks);
+
+static ssize_t data_prealloc_contig_only_show(struct kobject *kobj, struct kobj_attribute *attr,
+					      char *buf)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	struct scoutfs_mount_options opts;
+
+	scoutfs_options_read(sb, &opts);
+
+	return snprintf(buf, PAGE_SIZE, "%u", opts.data_prealloc_contig_only);
+}
+static ssize_t data_prealloc_contig_only_store(struct kobject *kobj, struct kobj_attribute *attr,
+					       const char *buf, size_t count)
+{
+	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
+	DECLARE_OPTIONS_INFO(sb, optinf);
+	char nullterm[20]; /* more than enough for octal -U32_MAX */
+	long val;
+	int len;
+	int ret;
+
+	len = min(count, sizeof(nullterm) - 1);
+	memcpy(nullterm, buf, len);
+	nullterm[len] = '\0';
+
+	ret = kstrtol(nullterm, 0, &val);
+	if (ret < 0 || val < 0 || val > 1) {
+		scoutfs_err(sb, "invalid data_prealloc_contig_only option, bool must be 0 or 1");
+		return -EINVAL;
+	}
+
+	write_seqlock(&optinf->seqlock);
+	optinf->opts.data_prealloc_contig_only = val;
+	write_sequnlock(&optinf->seqlock);
+
+	return count;
+}
+SCOUTFS_ATTR_RW(data_prealloc_contig_only);
+
 static ssize_t metadev_path_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
 {
 	struct super_block *sb = SCOUTFS_SYSFS_ATTRS_SB(kobj);
@@ -345,6 +460,8 @@ static ssize_t quorum_slot_nr_show(struct kobject *kobj, struct kobj_attribute *
 SCOUTFS_ATTR_RO(quorum_slot_nr);
 
 static struct attribute *options_attrs[] = {
+	SCOUTFS_ATTR_PTR(data_prealloc_blocks),
+	SCOUTFS_ATTR_PTR(data_prealloc_contig_only),
 	SCOUTFS_ATTR_PTR(metadev_path),
 	SCOUTFS_ATTR_PTR(orphan_scan_delay_ms),
 	SCOUTFS_ATTR_PTR(quorum_slot_nr),
diff --git a/kmod/src/options.h b/kmod/src/options.h
index 26d1eb1e..d2e4ad74 100644
--- a/kmod/src/options.h
+++ b/kmod/src/options.h
@@ -6,6 +6,8 @@
 #include "format.h"
 
 struct scoutfs_mount_options {
+	u64 data_prealloc_blocks;
+	bool data_prealloc_contig_only;
 	char *metadev_path;
 	unsigned int orphan_scan_delay_ms;
 	int quorum_slot_nr;
diff --git a/utils/man/scoutfs.5 b/utils/man/scoutfs.5
index 8b20483b..8f619e4b 100644
--- a/utils/man/scoutfs.5
+++ b/utils/man/scoutfs.5
@@ -21,6 +21,40 @@ as detailed in
 .BR acl (5) .
 Support for POSIX ACLs is the default.
 .TP
+.B data_prealloc_blocks=
+Set the size of preallocation regions of data files, in 4KiB blocks.
+A write to a region that contains no extents will attempt to
+preallocate the full region. This can waste a lot of space with
+small files, files with sparse regions, and files whose final
+length isn't a multiple of the preallocation size. The
+data_prealloc_contig_only option below, which is enabled by default,
+restricts this behaviour to waste less space.
+.sp
+All the preallocation options can be changed in an active mount by
+writing to their respective files in the options directory in the
+mount's sysfs directory.
+.sp
+Note that it is always more efficient to use
+.BR fallocate (2)
+to precisely allocate large extents for the final size of the file.
+Enable it in software that supports it whenever possible.
+.TP
+.B data_prealloc_contig_only=<0|1>
+This option, enabled by default, limits file data preallocation in
+two ways. First, it will only preallocate when extending a fully
+allocated file. Second, it will limit the size of preallocation to the
+existing length of the file. These limits reduce the amount of
+preallocation wasted per file at the cost of multiple initial extents
+in all files. It only supports simple streaming writes; any other
+write pattern will not be recognized and could result in many
+fragmented extent allocations.
+.sp
+This option can be disabled to encourage large allocated extents
+regardless of write patterns. This can be helpful if files are written
+with initial sparse regions (perhaps by multiple threads writing to
+different regions) and wasted space isn't an issue (perhaps because the
+file population contains few small files).
+.TP
 .B metadev_path=
 The metadev_path option specifies the path to the block device that
 contains the filesystem's metadata.
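
Illustration (not part of the patch): the contig-only sizing in alloc_block() can be exercised in isolation. The stand-alone user-space sketch below mirrors that logic under data_prealloc_contig_only=1; the helper name contig_prealloc() and the next_extent_start sentinel (0 meaning "no next extent", standing in for the scoutfs_ext_next() lookup) are illustrative, not kernel code.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define min_u64(a, b) ((a) < (b) ? (a) : (b))

/*
 * Sizing used when data_prealloc_contig_only=1: only preallocate when
 * the write strictly extends a fully online file (iblock equals the
 * online block count), stop at the next existing extent, and never
 * preallocate more than the file's current size, so a stalled stream
 * wastes at most the file's size again.
 */
static u64 contig_prealloc(u64 iblock, u64 online, u64 next_extent_start,
			   u64 prealloc_blocks)
{
	u64 count = 1;	/* default: allocate only the written block */

	if (iblock > 1 && iblock == online) {
		if (next_extent_start > iblock)
			count = next_extent_start - iblock;
		else
			count = prealloc_blocks;
		count = min_u64(iblock, count);
	}

	return min_u64(count, prealloc_blocks);	/* overall prealloc limit */
}

int main(void)
{
	/* streaming write extending a 512-block online file: 512 blocks */
	printf("%llu\n", (unsigned long long)contig_prealloc(512, 512, 0, 2048));

	/* sparse write far past the online blocks: single block only */
	printf("%llu\n", (unsigned long long)contig_prealloc(4096, 512, 0, 2048));

	return 0;
}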
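A similar sketch of the aligned-region window placement, including the fixup applied when the allocator grants fewer blocks than requested and the window would otherwise end before iblock. place_window() is an illustrative name, and the plain % modulo stands in for the kernel's div64_u64_rem().

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

/*
 * Place an aligned preallocation window of prealloc_blocks around
 * iblock. When the allocator grants fewer blocks than requested
 * ('granted'), slide the window forward just far enough that its
 * last block is iblock, so the written block is always covered.
 */
static void place_window(u64 iblock, u64 prealloc_blocks, u64 granted,
			 u64 *start, u64 *count)
{
	u64 rem = iblock % prealloc_blocks;	/* div64_u64_rem() in the kernel */

	*start = iblock - rem;
	*count = granted;

	if (*start + *count <= iblock)
		*start += iblock - (*start + *count) + 1;

	assert(iblock >= *start && iblock < *start + *count);
}

int main(void)
{
	u64 start, count;

	/* full window: writing block 2050 with 2048-block regions */
	place_window(2050, 2048, 2048, &start, &count);
	printf("start %llu count %llu\n",	/* start 2048 count 2048 */
	       (unsigned long long)start, (unsigned long long)count);

	/* allocator only granted 2 blocks: window slides to cover 2050 */
	place_window(2050, 2048, 2, &start, &count);
	printf("start %llu count %llu\n",	/* start 2049 count 2 */
	       (unsigned long long)start, (unsigned long long)count);

	return 0;
}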