diff --git a/kmod/src/client.c b/kmod/src/client.c
index c3c0bb91..fe837420 100644
--- a/kmod/src/client.c
+++ b/kmod/src/client.c
@@ -555,20 +555,27 @@ int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count,
 	return ret;
 }
 
-int scoutfs_client_alloc_extent(struct super_block *sb, u64 len, u64 *start)
+/*
+ * Ask the server for an extent of at most @blocks blocks.  The server
+ * may return a shorter extent.
+ */
+int scoutfs_client_alloc_extent(struct super_block *sb, u64 blocks, u64 *start,
+				u64 *len)
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
-	__le64 lelen = cpu_to_le64(len);
-	__le64 lestart;
+	__le64 leblocks = cpu_to_le64(blocks);
+	struct scoutfs_net_extent nex;
 	int ret;
 
 	ret = client_request(client, SCOUTFS_NET_ALLOC_EXTENT,
-			     &lelen, sizeof(lelen), &lestart, sizeof(lestart));
+			     &leblocks, sizeof(leblocks), &nex, sizeof(nex));
 	if (ret == 0) {
-		if (lestart == 0)
+		if (nex.len == 0) {
 			ret = -ENOSPC;
-		else
-			*start = le64_to_cpu(lestart);
+		} else {
+			*start = le64_to_cpu(nex.start);
+			*len = le64_to_cpu(nex.len);
+		}
 	}
 
 	return ret;
diff --git a/kmod/src/client.h b/kmod/src/client.h
index aa098f02..259454dd 100644
--- a/kmod/src/client.h
+++ b/kmod/src/client.h
@@ -3,7 +3,8 @@
 
 int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count,
 				u64 *ino, u64 *nr);
-int scoutfs_client_alloc_extent(struct super_block *sb, u64 len, u64 *start);
+int scoutfs_client_alloc_extent(struct super_block *sb, u64 blocks, u64 *start,
+				u64 *len);
int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno);
 int scoutfs_client_record_segment(struct super_block *sb,
 				  struct scoutfs_segment *seg, u8 level);
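A note on the wire change above: the alloc_extent reply is now a (start,
len) pair rather than a bare start block, and a zero length is the
in-band "no space" signal that the client turns into -ENOSPC.  The
request itself is still a single __le64 block count.  Below is a minimal
userspace model of the reply decode; it is a sketch only, the names are
hypothetical, and the packed __le64 layout and endian conversion of the
real scoutfs_net_extent are elided.

	#include <stdint.h>
	#include <errno.h>

	/* hypothetical userspace mirror of the 16-byte reply payload */
	struct net_extent {
		uint64_t start;	/* first block of the granted extent */
		uint64_t len;	/* block count; 0 means nothing granted */
	};

	/* decode an alloc_extent reply: 0 on success, -ENOSPC if empty */
	static int decode_alloc_reply(const struct net_extent *nex,
				      uint64_t *start, uint64_t *len)
	{
		if (nex->len == 0)
			return -ENOSPC;
		*start = nex->start;
		*len = nex->len;
		return 0;
	}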
diff --git a/kmod/src/data.c b/kmod/src/data.c
index 50196f37..96fa9b9f 100644
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
@@ -58,6 +58,20 @@
  * - need trans around each bulk alloc
  */
 
+/*
+ * The largest extent that we'll store in a single item.  This sets the
+ * granularity of interleaved concurrent allocations on a node, limits
+ * the amount of IO needed to invalidate a lock, and sets the
+ * granularity of parallel writes to a file between nodes.  Sequential
+ * maximal allocations can still be given physically contiguous
+ * extents.
+ */
+#define MAX_EXTENT_BLOCKS (8ULL * 1024 * 1024 >> SCOUTFS_BLOCK_SHIFT)
+/*
+ * For now we ask the server for a fixed size.
+ */
+#define SERVER_ALLOC_BLOCKS (MAX_EXTENT_BLOCKS * 8)
+
 struct data_info {
 	struct rw_semaphore alloc_rwsem;
 };
@@ -399,14 +413,16 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode,
 	return ret;
 }
 
-static int get_server_extent(struct super_block *sb, u64 len)
+static int get_server_extent(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_extent ext;
 	u64 start;
+	u64 len;
 	int ret;
 
-	ret = scoutfs_client_alloc_extent(sb, len, &start);
+	ret = scoutfs_client_alloc_extent(sb, SERVER_ALLOC_BLOCKS,
+					  &start, &len);
 	if (ret)
 		goto out;
 
@@ -420,6 +436,61 @@ out:
 	return ret;
 }
 
+/*
+ * Find a free extent to satisfy an allocation of at most @len blocks.
+ *
+ * Returns 0 and fills the caller's extent with a _BLKNO_TYPE extent if
+ * we found a match.  Its len may be less than desired.  No stored
+ * extents have been modified.
+ *
+ * Returns -errno on error and -ENOSPC if no free extents were found.
+ *
+ * The caller's extent is always clobbered.
+ */
+static int find_free_extent(struct super_block *sb, u64 len,
+			    struct scoutfs_extent *ext)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	int err;
+	int ret;
+
+	len = min(len, MAX_EXTENT_BLOCKS);
+
+	for (;;) {
+		/* first look for an extent that's large enough */
+		scoutfs_extent_init(ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE,
+				    sbi->node_id, 0, len, 0, 0);
+		ret = scoutfs_extent_next(sb, data_extent_io, ext,
+					  sbi->node_id_lock);
+
+		/* if none was large enough, take the largest smaller one */
+		if (ret == -ENOENT && len > 1)
+			ret = scoutfs_extent_prev(sb, data_extent_io, ext,
+						  sbi->node_id_lock);
+
+		/* ask the server for more if we think it'll help */
+		if (ret == -ENOENT || (ret == 0 && ext->len < len)) {
+			err = get_server_extent(sb);
+			if (err == 0)
+				continue;
+			/* fall back to a shorter extent if we found one */
+			if (ret == -ENOENT)
+				ret = err;
+		}
+
+		/* use the extent we found or return errors */
+		break;
+	}
+
+	if (ret == 0)
+		scoutfs_extent_init(ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
+				    sbi->node_id, ext->start,
+				    min(ext->len, len), 0, 0);
+
+	trace_scoutfs_data_find_free_extent(sb, ext);
+	return ret;
+}
+
 /*
  * The caller is writing to a logical block that doesn't have an
  * allocated extent.
@@ -448,8 +519,6 @@
  * On success we update the caller's extent to the single block
  * allocated extent for the logical block for use in block mapping.
  */
-#define MAX_STREAMING_PREALLOC_BLOCKS ((u64)SCOUTFS_SEGMENT_BLOCKS)
-#define SERVER_ALLOC_BLOCKS (MAX_STREAMING_PREALLOC_BLOCKS * 32)
 static int alloc_block(struct super_block *sb, struct inode *inode,
 		       struct scoutfs_extent *ext, u64 iblock, u64 len,
 		       struct scoutfs_lock *lock)
@@ -474,30 +543,16 @@ static int alloc_block(struct super_block *sb, struct inode *inode,
 
 	/* strictly contiguous extending writes will try to preallocate */
 	if (iblock > 1 && iblock == online)
-		len = min3(len, iblock, MAX_STREAMING_PREALLOC_BLOCKS);
+		len = min3(len, iblock, MAX_EXTENT_BLOCKS);
 	else
 		len = 1;
 
 	trace_scoutfs_data_alloc_block(sb, inode, ext, iblock, len,
 				       online, offline);
 
-	scoutfs_extent_init(&fr, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE,
-			    sbi->node_id, 0, len, 0, 0);
-	ret = scoutfs_extent_next(sb, data_extent_io, &fr,
-				  sbi->node_id_lock);
-	if (ret == -ENOENT) {
-		/* try to get allocation from the server if we're out */
-		ret = get_server_extent(sb, SERVER_ALLOC_BLOCKS);
-		if (ret == 0)
-			ret = scoutfs_extent_next(sb, data_extent_io, &fr,
-						  sbi->node_id_lock);
-	}
-	if (ret) {
-		/* XXX should try to look for smaller free extents :/ */
-		if (ret == -ENOENT)
-			ret = -ENOSPC;
+	ret = find_free_extent(sb, len, &fr);
+	if (ret < 0)
 		goto out;
-	}
 
 	trace_scoutfs_data_alloc_block_next(sb, &fr);
 
@@ -505,9 +560,7 @@
 	scoutfs_extent_init(&blk, SCOUTFS_FILE_EXTENT_TYPE, ino,
 			    iblock, 1, fr.start, 0);
 
-	/* remove the free extent we're using */
-	scoutfs_extent_init(&fr, SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
-			    sbi->node_id, fr.start, len, 0, 0);
+	/* remove the free extent that we're allocating */
 	ret = scoutfs_extent_remove(sb, data_extent_io, &fr, sbi->node_id_lock);
 	if (ret)
 		goto out;
@@ -855,21 +908,14 @@ static int scoutfs_write_end(struct file *file, struct address_space *mapping,
 }
 
 /*
- * Update one extent on behalf of fallocate.
+ * Allocate one extent on behalf of fallocate.  The caller has given us
+ * the largest extent we can add, its flags, and the flags of an
+ * existing overlapping extent to remove.
  *
- * The caller has searched for the next extent that intersects with the
- * region including first_block and last_block.  The next extent will be
- * zeroed if it wasn't found.  We don't know the state of the offsets
- * past the next extent.
- *
- * The caller has held transactions and acquired locks.  We only ever
- * make one extent modification here.
- *
- * If this returns 0 then the caller's extent is clobbered.  It is set
- * to the newly fallocated extent so that the caller can continue with
- * the fallocate operation.
+ * We allocate the largest extent that we can and return its length in
+ * blocks, or -errno on failure.
 */
-static int fallocate_one_extent(struct super_block *sb, u64 ino, u64 start,
+static s64 fallocate_one_extent(struct super_block *sb, u64 ino, u64 start,
 				u64 len, u8 flags, u8 rem_flags,
 				struct scoutfs_lock *lock)
 {
@@ -879,7 +925,7 @@ static s64 fallocate_one_extent(struct super_block *sb, u64 ino, u64 start,
 	struct scoutfs_extent fr;
 	bool add_rem = false;
 	bool add_fr = false;
-	int ret;
+	s64 ret;
 
 	if (WARN_ON_ONCE(len == 0) ||
 	    WARN_ON_ONCE(start + len < start)) {
@@ -887,28 +933,9 @@
 		ret = -EINVAL;
 		goto out;
 	}
 
-	/* find a sufficiently large free extent */
-	scoutfs_extent_init(&fr, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE,
-			    sbi->node_id, 0, len, 0, 0);
-	ret = scoutfs_extent_next(sb, data_extent_io, &fr,
-				  sbi->node_id_lock);
-	if (ret == -ENOENT) {
-		/* try to get allocation from the server if we're out */
-		ret = get_server_extent(sb, SERVER_ALLOC_BLOCKS);
-		if (ret == 0)
-			ret = scoutfs_extent_next(sb, data_extent_io, &fr,
-						  sbi->node_id_lock);
-		/* XXX try to find smaller free extents */
-	}
-	if (ret < 0) {
-		if (ret == -ENOENT)
-			ret = -ENOSPC;
+	ret = find_free_extent(sb, len, &fr);
+	if (ret < 0)
 		goto out;
-	}
-
-	/* trim our allocation from the length indexed extent */
-	scoutfs_extent_init(&fr, SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
-			    sbi->node_id, fr.start, min(fr.len, len), 0, 0);
 	ret = scoutfs_extent_init(&fal, SCOUTFS_FILE_EXTENT_TYPE, ino,
 				  start, fr.len, fr.start, flags);
@@ -931,6 +958,8 @@
 	}
 
 	ret = scoutfs_extent_add(sb, data_extent_io, &fal, lock);
+	if (ret == 0)
+		ret = fal.len;
 out:
 	scoutfs_extent_cleanup(ret < 0 && add_rem, scoutfs_extent_add, sb,
 			       data_extent_io, &rem, lock,
@@ -961,7 +990,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	LIST_HEAD(ind_locks);
 	u64 last_block;
 	u64 iblock;
-	u64 blocks;
+	s64 blocks;
 	loff_t end;
 	u8 rem_flags;
 	u8 flags;
@@ -1003,7 +1032,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	iblock = offset >> SCOUTFS_BLOCK_SHIFT;
 	last_block = (offset + len - 1) >> SCOUTFS_BLOCK_SHIFT;
 
-	for (; iblock <= last_block; iblock = ext.start + ext.len) {
+	while (iblock <= last_block) {
 
 		scoutfs_extent_init(&ext, SCOUTFS_FILE_EXTENT_TYPE, ino,
 				    iblock, 1, 0, 0);
@@ -1020,17 +1049,19 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 		} else if (iblock < ext.start) {
 			/* sparse region until next extent */
-			blocks = min(blocks, ext.start - iblock);
+			blocks = min_t(u64, blocks, ext.start - iblock);
 
 		} else if (ext.map > 0) {
 			/* skip past an allocated extent */
-			blocks = min(blocks, (ext.start + ext.len) - iblock);
+			blocks = min_t(u64, blocks,
+				       (ext.start + ext.len) - iblock);
 			iblock += blocks;
 			blocks = 0;
 
 		} else {
 			/* allocating a portion of an unallocated extent */
-			blocks = min(blocks, (ext.start + ext.len) - iblock);
+			blocks = min_t(u64, blocks,
+				       (ext.start + ext.len) - iblock);
 			flags |= ext.flags;
 			rem_flags = ext.flags;
 			/* XXX corruption; why'd we store map == flags == 0? */
@@ -1047,9 +1078,13 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 		if (blocks > 0) {
 			down_write(&datinf->alloc_rwsem);
-			ret = fallocate_one_extent(sb, ino, iblock, blocks,
-						   flags, rem_flags, lock);
+			blocks = fallocate_one_extent(sb, ino, iblock, blocks,
						      flags, rem_flags, lock);
 			up_write(&datinf->alloc_rwsem);
+			if (blocks < 0)
+				ret = blocks;
+			else
+				ret = 0;
 		}
 
 		if (ret == 0 && !(mode & FALLOC_FL_KEEP_SIZE)) {
@@ -1068,7 +1103,7 @@ long scoutfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 		iblock += blocks;
 	}
 
-	ret = 0;
+
 out:
 	scoutfs_unlock(sb, lock, DLM_LOCK_EX);
 	mutex_unlock(&inode->i_mutex);
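The search order that find_free_extent() implements above deserves a
spelled-out model: first look for an extent at least as large as the
request, then fall back to the largest smaller extent, and only ask the
server for more space when neither satisfies the request.  Here is a
compact userspace sketch of that policy; find_ge(), find_lt(), and
ask_server() are hypothetical stubs standing in for
scoutfs_extent_next(), scoutfs_extent_prev(), and get_server_extent().

	#include <stdbool.h>
	#include <stdint.h>
	#include <errno.h>

	/* hypothetical stand-ins for the extent searches and the RPC */
	bool find_ge(uint64_t len, uint64_t *found); /* first extent >= len */
	bool find_lt(uint64_t len, uint64_t *found); /* largest extent < len */
	int ask_server(void);			     /* 0 if more space arrived */

	/* mirror the fallback policy: shrink the grant before failing */
	int choose_extent(uint64_t want, uint64_t *granted)
	{
		uint64_t len;

		for (;;) {
			if (find_ge(want, &len)) {
				*granted = want;	/* trim to the request */
				return 0;
			}
			if (find_lt(want, &len)) {
				/* shorter than wanted: top up if we can */
				if (ask_server() == 0)
					continue;
				*granted = len;	/* better than failing */
				return 0;
			}
			if (ask_server() != 0)
				return -ENOSPC;
		}
	}

The important property is the second branch: running the server out of
space no longer fails an allocation that a shorter cached extent could
have satisfied.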
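scoutfs_net_extent is copied directly into request and reply payloads,
so its packed 16-byte layout is part of the protocol.  A compile-time
guard like the following is the usual way to pin that down; the check
is illustrative only and not part of the patch, and it assumes format.h
is included:

	#include <linux/bug.h>

	static inline void check_net_extent_size(void)
	{
		/* two __le64 fields, __packed: exactly 16 bytes on the wire */
		BUILD_BUG_ON(sizeof(struct scoutfs_net_extent) != 16);
	}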
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index 87495177..9d32367b 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -2255,6 +2255,10 @@ DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_get_server_extent,
 	TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
 	TP_ARGS(sb, ext)
 );
+DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_find_free_extent,
+	TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
+	TP_ARGS(sb, ext)
+);
 DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_alloc_block_next,
 	TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
 	TP_ARGS(sb, ext)
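The server side of the policy follows in server.c: alloc_extent() now
falls back from the first sufficiently large free extent to the largest
one it has, and trims the grant with min().  A toy userspace model of
that selection, with a hypothetical in-memory table standing in for the
length-indexed _BLOCKS_TYPE items:

	#include <stdint.h>
	#include <stddef.h>

	/* toy free extent lengths, ascending as the length index returns them */
	static const uint64_t free_lens[] = { 16, 64, 128 };
	#define NR_FREE (sizeof(free_lens) / sizeof(free_lens[0]))

	/* return the length that a request for @blocks would be granted */
	static uint64_t grant_len(uint64_t blocks)
	{
		size_t i;

		/* "next": trim the first extent that's large enough */
		for (i = 0; i < NR_FREE; i++)
			if (free_lens[i] >= blocks)
				return blocks;

		/* "prev": otherwise grant the largest extent that exists */
		return free_lens[NR_FREE - 1];
	}

With that table, a 512-block request that used to fail with -ENOSPC now
receives the 128-block extent.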
diff --git a/kmod/src/server.c b/kmod/src/server.c
index d830d53e..ee52e8cc 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -218,7 +218,8 @@ static int server_extent_io(struct super_block *sb, int op,
 * free extent that can be very quickly allocated to a node.  The hope is
 * that doesn't happen very often.
 */
-static int alloc_extent(struct super_block *sb, u64 len, u64 *start)
+static int alloc_extent(struct super_block *sb, u64 blocks,
+			u64 *start, u64 *len)
 {
 	struct server_info *server = SCOUTFS_SB(sb)->server_info;
 	struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
@@ -226,17 +227,20 @@ static int alloc_extent(struct super_block *sb, u64 len, u64 *start)
 	int ret;
 
 	*start = 0;
+	*len = 0;
 
 	down_write(&server->alloc_rwsem);
 
-	if (len & (SCOUTFS_SEGMENT_BLOCKS - 1)) {
+	if (blocks & (SCOUTFS_SEGMENT_BLOCKS - 1)) {
 		ret = -EINVAL;
 		goto out;
 	}
 
 	scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE, 0,
-			    0, len, 0, 0);
+			    0, blocks, 0, 0);
 	ret = scoutfs_extent_next(sb, server_extent_io, &ext, NULL);
+	if (ret == -ENOENT)
+		ret = scoutfs_extent_prev(sb, server_extent_io, &ext, NULL);
 	if (ret) {
 		if (ret == -ENOENT)
 			ret = -ENOSPC;
@@ -246,7 +250,7 @@ static int alloc_extent(struct super_block *sb, u64 len, u64 *start)
 	trace_scoutfs_server_alloc_extent_next(sb, &ext);
 
 	ext.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
-	ext.len = len;
+	ext.len = min(blocks, ext.len);
 
 	ret = scoutfs_extent_remove(sb, server_extent_io, &ext, NULL);
 	if (ret)
@@ -256,6 +260,7 @@ static int alloc_extent(struct super_block *sb, u64 len, u64 *start)
 	le64_add_cpu(&super->free_blocks, -ext.len);
 
 	*start = ext.start;
+	*len = ext.len;
 	ret = 0;
 
 out:
@@ -705,26 +710,29 @@ static int process_alloc_extent(struct server_connection *conn,
 	struct server_info *server = conn->server;
 	struct super_block *sb = server->sb;
 	struct commit_waiter cw;
-	__le64 lestart;
-	__le64 lelen;
+	struct scoutfs_net_extent nex;
+	__le64 leblocks;
 	u64 start;
+	u64 len;
 	int ret;
 
-	if (data_len != sizeof(lelen)) {
+	if (data_len != sizeof(leblocks)) {
 		ret = -EINVAL;
 		goto out;
 	}
-	memcpy(&lelen, data, data_len);
+	memcpy(&leblocks, data, data_len);
 
 	down_read(&server->commit_rwsem);
-	ret = alloc_extent(sb, le64_to_cpu(lelen), &start);
+	ret = alloc_extent(sb, le64_to_cpu(leblocks), &start, &len);
 	if (ret == -ENOSPC) {
 		start = 0;
+		len = 0;
 		ret = 0;
 	}
 	if (ret == 0) {
-		lestart = cpu_to_le64(start);
+		nex.start = cpu_to_le64(start);
+		nex.len = cpu_to_le64(len);
 		queue_commit_work(server, &cw);
 	}
 	up_read(&server->commit_rwsem);
 
@@ -732,7 +740,7 @@ static int process_alloc_extent(struct server_connection *conn,
 	if (ret == 0)
 		ret = wait_for_commit(server, &cw, id, type);
 out:
-	return send_reply(conn, id, type, ret, &lestart, sizeof(lestart));
+	return send_reply(conn, id, type, ret, &nex, sizeof(nex));
 }
 
 /*
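Finally, the fallocate_one_extent() return convention in data.c is what
lets scoutfs_fallocate() cope with short grants: the helper returns the
number of blocks it actually allocated and the caller advances iblock
by that amount, looping until the range is covered.  A simplified
userspace model of that caller contract; fill_range() and the callback
are hypothetical:

	#include <stdint.h>

	/* allocate up to @len blocks at @start; blocks allocated or -errno */
	typedef int64_t (*alloc_one_t)(uint64_t start, uint64_t len);

	static int64_t fill_range(uint64_t iblock, uint64_t last_block,
				  alloc_one_t alloc_one)
	{
		int64_t got;

		while (iblock <= last_block) {
			got = alloc_one(iblock, last_block - iblock + 1);
			if (got < 0)
				return got;	/* -errno from the allocator */
			/* a short grant just means another trip around */
			iblock += got;
		}
		return 0;
	}

The model assumes the callback never returns 0, matching the kernel
helper, which fails with -ENOSPC rather than allocating nothing.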