diff --git a/kmod/src/client.c b/kmod/src/client.c
index 69b12248..fedaee8d 100644
--- a/kmod/src/client.c
+++ b/kmod/src/client.c
@@ -582,6 +582,18 @@ int scoutfs_client_alloc_extent(struct super_block *sb, u64 blocks, u64 *start,
 	return ret;
 }
 
+int scoutfs_client_free_extents(struct super_block *sb,
+				struct scoutfs_net_extent_list *nexl)
+{
+	struct client_info *client = SCOUTFS_SB(sb)->client_info;
+	unsigned int bytes;
+
+	bytes = SCOUTFS_NET_EXTENT_LIST_BYTES(le64_to_cpu(nexl->nr));
+
+	return client_request(client, SCOUTFS_NET_FREE_EXTENTS,
+			      nexl, bytes, NULL, 0);
+}
+
 int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno)
 {
 	struct client_info *client = SCOUTFS_SB(sb)->client_info;
diff --git a/kmod/src/client.h b/kmod/src/client.h
index 259454dd..f0a8d609 100644
--- a/kmod/src/client.h
+++ b/kmod/src/client.h
@@ -5,6 +5,8 @@ int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count,
 				u64 *ino, u64 *nr);
 int scoutfs_client_alloc_extent(struct super_block *sb, u64 blocks, u64 *start,
 				u64 *len);
+int scoutfs_client_free_extents(struct super_block *sb,
+				struct scoutfs_net_extent_list *nexl);
 int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno);
 int scoutfs_client_record_segment(struct super_block *sb,
 				  struct scoutfs_segment *seg, u8 level);
diff --git a/kmod/src/count.h b/kmod/src/count.h
index 41817ea7..db35ebec 100644
--- a/kmod/src/count.h
+++ b/kmod/src/count.h
@@ -277,6 +277,21 @@ SIC_TRUNC_EXTENT(struct inode *inode)
 	return cnt;
 }
 
+/*
+ * Returning extents to the server can, at most:
+ *  - delete MAX_NR extents with indexed copies
+ *  - create an extent for the leftovers of the last extent
+ */
+static inline const struct scoutfs_item_count SIC_RETURN_EXTENTS(void)
+{
+	struct scoutfs_item_count cnt = {0,};
+	unsigned int nr = SCOUTFS_NET_EXTENT_LIST_MAX_NR + 1;
+
+	cnt.items += (nr * 2);
+
+	return cnt;
+}
+
 /*
  * Fallocating an extent can, at most:
  *  - allocate from the server: delete two free and insert merged
diff --git a/kmod/src/data.c b/kmod/src/data.c
index 07b90294..5485af0b 100644
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include "format.h"
 #include "super.h"
@@ -38,6 +39,7 @@
 #include "file.h"
 #include "extents.h"
 #include "msg.h"
+#include "count.h"
 
 /*
  * scoutfs uses extent items to track file data block mappings and free
@@ -72,9 +74,17 @@
  * We ask for a fixed size from the server today.
  */
 #define SERVER_ALLOC_BLOCKS (MAX_EXTENT_BLOCKS * 8)
+/*
+ * Send free extents back to the server if we have plenty locally.
+ */
+#define NODE_FREE_HIGH_WATER_BLOCKS (SERVER_ALLOC_BLOCKS * 16)
 
 struct data_info {
+	struct super_block *sb;
 	struct rw_semaphore alloc_rwsem;
+	atomic64_t node_free_blocks;
+	struct workqueue_struct *workq;
+	struct work_struct return_work;
 };
 
 #define DECLARE_DATA_INFO(sb, name) \
@@ -148,10 +158,16 @@ static int init_extent_from_item(struct scoutfs_extent *ext,
  * We also index free extents by their length.  We implement that by
  * keeping their _BLOCKS_ item in sync with the primary _BLKNO_ item
  * that callers operate on.
+ *
+ * The count of free blocks stored in node items is kept consistent by
+ * updating the count every time we create or delete items.  Updated
+ * extents are deleted and then recreated so the count can bounce around
+ * a bit, but it's OK for it to be imprecise at the margins.
  */
 static int data_extent_io(struct super_block *sb, int op,
 			  struct scoutfs_extent *ext, void *data)
 {
+	DECLARE_DATA_INFO(sb, datinf);
 	struct scoutfs_lock *lock = data;
 	struct scoutfs_file_extent fex;
 	struct scoutfs_key first;
@@ -230,6 +246,13 @@ static int data_extent_io(struct super_block *sb, int op,
 		}
 	}
 
+	if (ret == 0 && ext->type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE) {
+		if (op == SEI_INSERT)
+			atomic64_add(ext->len, &datinf->node_free_blocks);
+		else if (op == SEI_DELETE)
+			atomic64_sub(ext->len, &datinf->node_free_blocks);
+	}
+
 	return ret;
 }
 
@@ -252,6 +275,7 @@ static s64 truncate_one_extent(struct super_block *sb, struct inode *inode,
 			       struct scoutfs_lock *lock)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	DECLARE_DATA_INFO(sb, datinf);
 	struct scoutfs_extent next;
 	struct scoutfs_extent rem;
 	struct scoutfs_extent fr;
@@ -324,6 +348,11 @@ static s64 truncate_one_extent(struct super_block *sb, struct inode *inode,
 
 	scoutfs_inode_add_onoff(inode, online_delta, offline_delta);
 
+	/* start returning free extents to the server after a small delay */
+	if (rem.map && (atomic64_read(&datinf->node_free_blocks) >
+			NODE_FREE_HIGH_WATER_BLOCKS))
+		queue_work(datinf->workq, &datinf->return_work);
+
 	ret = 1;
 out:
 	scoutfs_extent_cleanup(ret < 0 && add_rem, scoutfs_extent_add, sb,
@@ -1238,6 +1267,94 @@ const struct file_operations scoutfs_file_fops = {
 	.fallocate = scoutfs_fallocate,
 };
 
+/*
+ * Return extents to the server if we're over the high water mark.  Each
+ * work call sends one batch of extents so that the work can be easily
+ * canceled to stop progress during unmount.
+ */
+static void scoutfs_data_return_server_extents_worker(struct work_struct *work)
+{
+	struct data_info *datinf = container_of(work, struct data_info,
+						return_work);
+	struct super_block *sb = datinf->sb;
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_net_extent_list *nexl;
+	struct scoutfs_extent ext;
+	u64 nr = 0;
+	u64 free;
+	int bytes;
+	int ret;
+	int err;
+
+	trace_scoutfs_data_return_server_extents_enter(sb, 0, 0);
+
+	bytes = SCOUTFS_NET_EXTENT_LIST_BYTES(SCOUTFS_NET_EXTENT_LIST_MAX_NR);
+	nexl = kmalloc(bytes, GFP_NOFS);
+	if (!nexl) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = scoutfs_hold_trans(sb, SIC_RETURN_EXTENTS());
+	if (ret)
+		goto out;
+
+	down_write(&datinf->alloc_rwsem);
+
+	free = atomic64_read(&datinf->node_free_blocks);
+
+	while (nr < SCOUTFS_NET_EXTENT_LIST_MAX_NR &&
+	       free > NODE_FREE_HIGH_WATER_BLOCKS) {
+
+		scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE,
+				    sbi->node_id, 0, 1, 0, 0);
+		ret = scoutfs_extent_next(sb, data_extent_io, &ext,
+					  sbi->node_id_lock);
+		if (ret < 0) {
+			if (ret == -ENOENT)
+				ret = 0;
+			break;
+		}
+
+		trace_scoutfs_data_return_server_extent(sb, &ext);
+
+		ext.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE;
+		ext.len = min(ext.len, free - NODE_FREE_HIGH_WATER_BLOCKS);
+
+		ret = scoutfs_extent_remove(sb, data_extent_io, &ext,
+					    sbi->node_id_lock);
+		if (ret)
+			break;
+
+		nexl->extents[nr].start = cpu_to_le64(ext.start);
+		nexl->extents[nr].len = cpu_to_le64(ext.len);
+
+		nr++;
+		free -= ext.len;
+	}
+
+	nexl->nr = cpu_to_le64(nr);
+
+	up_write(&datinf->alloc_rwsem);
+
+	if (nr > 0) {
+		err = scoutfs_client_free_extents(sb, nexl);
+		/* XXX leaked extents if free failed */
+		if (ret == 0 && err < 0)
+			ret = err;
+	}
+
+	scoutfs_release_trans(sb);
+out:
+	kfree(nexl);
+
+	trace_scoutfs_data_return_server_extents_exit(sb, nr, ret);
+
+	/* keep returning if we're still over the water mark */
+	if (ret == 0 && (atomic64_read(&datinf->node_free_blocks) >
+			 NODE_FREE_HIGH_WATER_BLOCKS))
+		queue_work(datinf->workq, &datinf->return_work);
+}
 
 int scoutfs_data_setup(struct super_block *sb)
 {
@@ -1248,10 +1365,19 @@ int scoutfs_data_setup(struct super_block *sb)
 	if (!datinf)
 		return -ENOMEM;
 
+	datinf->sb = sb;
 	init_rwsem(&datinf->alloc_rwsem);
+	atomic64_set(&datinf->node_free_blocks, 0);
+	INIT_WORK(&datinf->return_work,
+		  scoutfs_data_return_server_extents_worker);
+
+	datinf->workq = alloc_workqueue("scoutfs_data", 0, 1);
+	if (!datinf->workq) {
+		kfree(datinf);
+		return -ENOMEM;
+	}
 
 	sbi->data_info = datinf;
-
 	return 0;
 }
@@ -1260,5 +1386,14 @@ void scoutfs_data_destroy(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct data_info *datinf = sbi->data_info;
 
-	kfree(datinf);
+	if (datinf) {
+		if (datinf->workq) {
+			cancel_work_sync(&datinf->return_work);
+			destroy_workqueue(datinf->workq);
+			datinf->workq = NULL;
+		}
+
+		sbi->data_info = NULL;
+		kfree(datinf);
+	}
 }
diff --git a/kmod/src/format.h b/kmod/src/format.h
index 9addabf5..e1dfc9a2 100644
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -562,6 +562,20 @@ struct scoutfs_net_extent {
 	__le64 len;
 } __packed;
 
+struct scoutfs_net_extent_list {
+	__le64 nr;
+	struct {
+		__le64 start;
+		__le64 len;
+	} __packed extents[0];
+} __packed;
+
+#define SCOUTFS_NET_EXTENT_LIST_BYTES(nr) \
+	offsetof(struct scoutfs_net_extent_list, extents[nr])
+
+/* arbitrarily makes a nice ~1k extent list payload */
+#define SCOUTFS_NET_EXTENT_LIST_MAX_NR 64
+
 /* XXX eventually we'll have net compaction and will need agents to agree */
 /* one upper segment and fanout lower segments */
@@ -575,6 +589,7 @@ struct scoutfs_net_extent {
 enum {
 	SCOUTFS_NET_ALLOC_INODES = 0,
 	SCOUTFS_NET_ALLOC_EXTENT,
+	SCOUTFS_NET_FREE_EXTENTS,
 	SCOUTFS_NET_ALLOC_SEGNO,
 	SCOUTFS_NET_RECORD_SEGMENT,
 	SCOUTFS_NET_ADVANCE_SEQ,
diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h
index fd3a0f6c..b223d70d 100644
--- a/kmod/src/scoutfs_trace.h
+++ b/kmod/src/scoutfs_trace.h
@@ -1726,6 +1726,14 @@ DEFINE_EVENT(scoutfs_work_class, scoutfs_server_workqueue_destroy,
 	TP_PROTO(struct super_block *sb, u64 data, int ret),
 	TP_ARGS(sb, data, ret)
 );
+DEFINE_EVENT(scoutfs_work_class, scoutfs_data_return_server_extents_enter,
+	TP_PROTO(struct super_block *sb, u64 data, int ret),
+	TP_ARGS(sb, data, ret)
+);
+DEFINE_EVENT(scoutfs_work_class, scoutfs_data_return_server_extents_exit,
+	TP_PROTO(struct super_block *sb, u64 data, int ret),
+	TP_ARGS(sb, data, ret)
+);
 
 TRACE_EVENT(scoutfs_item_next_range_check,
 	TP_PROTO(struct super_block *sb, int cached,
@@ -2149,6 +2157,10 @@ DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_fiemap_extent,
 	TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
 	TP_ARGS(sb, ext)
 );
+DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_return_server_extent,
+	TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
+	TP_ARGS(sb, ext)
+);
 DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_alloc_extent_next,
 	TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext),
 	TP_ARGS(sb, ext)
diff --git a/kmod/src/server.c b/kmod/src/server.c
index a88f8721..da284854 100644
--- a/kmod/src/server.c
+++ b/kmod/src/server.c
@@ -743,6 +743,54 @@ out:
 	return send_reply(conn, id, type, ret, &nex, sizeof(nex));
 }
 
+static bool invalid_net_extent_list(struct scoutfs_net_extent_list *nexl,
+				    unsigned data_len)
+{
+	return (data_len < sizeof(struct scoutfs_net_extent_list)) ||
+	       (le64_to_cpu(nexl->nr) > SCOUTFS_NET_EXTENT_LIST_MAX_NR) ||
+	       (data_len != offsetof(struct scoutfs_net_extent_list,
+				     extents[le64_to_cpu(nexl->nr)]));
+}
+
+static int process_free_extents(struct server_connection *conn,
+				u64 id, u8 type, void *data, unsigned data_len)
+{
+	struct server_info *server = conn->server;
+	struct super_block *sb = server->sb;
+	struct scoutfs_net_extent_list *nexl;
+	struct commit_waiter cw;
+	int ret = 0;
+	int err;
+	u64 i;
+
+	nexl = data;
+	if (invalid_net_extent_list(nexl, data_len)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	down_read(&server->commit_rwsem);
+
+	for (i = 0; i < le64_to_cpu(nexl->nr); i++) {
+		ret = free_extent(sb, le64_to_cpu(nexl->extents[i].start),
+				  le64_to_cpu(nexl->extents[i].len));
+		if (ret)
+			break;
+	}
+
+	if (i > 0)
+		queue_commit_work(server, &cw);
+	up_read(&server->commit_rwsem);
+
+	if (i > 0) {
+		err = wait_for_commit(server, &cw, id, type);
+		if (ret == 0)
+			ret = err;
+	}
+out:
+	return send_reply(conn, id, type, ret, NULL, 0);
+}
+
 /*
  * We still special case segno allocation because it's aligned and we'd
  * like to keep that detail in the server.
@@ -1091,6 +1139,7 @@ static void scoutfs_server_process_func(struct work_struct *work)
 	static process_func_t process_funcs[] = {
 		[SCOUTFS_NET_ALLOC_INODES] = process_alloc_inodes,
 		[SCOUTFS_NET_ALLOC_EXTENT] = process_alloc_extent,
+		[SCOUTFS_NET_FREE_EXTENTS] = process_free_extents,
 		[SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno,
 		[SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment,
 		[SCOUTFS_NET_ADVANCE_SEQ] = process_advance_seq,
diff --git a/kmod/src/super.c b/kmod/src/super.c
index 8e7e12fd..4279ea96 100644
--- a/kmod/src/super.c
+++ b/kmod/src/super.c
@@ -120,12 +120,13 @@ static void scoutfs_put_super(struct super_block *sb)
 
 	sbi->shutdown = true;
 
+	scoutfs_data_destroy(sb);
+
 	scoutfs_unlock(sb, sbi->node_id_lock, DLM_LOCK_EX);
 	sbi->node_id_lock = NULL;
 
 	scoutfs_shutdown_trans(sb);
 	scoutfs_client_destroy(sb);
-	scoutfs_data_destroy(sb);
 	scoutfs_inode_destroy(sb);
 	scoutfs_item_destroy(sb);
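
For reference, the new wire format sizes its payload from the extent count:
SCOUTFS_NET_EXTENT_LIST_BYTES(nr) is offsetof(struct scoutfs_net_extent_list,
extents[nr]), so a full list of SCOUTFS_NET_EXTENT_LIST_MAX_NR (64) extents is
8 + 64 * 16 = 1032 bytes, the "~1k" payload the format.h comment mentions, and
invalid_net_extent_list() rejects any request whose data_len doesn't land
exactly on extents[nr].  Here's a minimal userspace sketch of that math; the
type and macro names (net_extent, net_extent_list, NET_EXTENT_LIST_BYTES) are
stand-ins rather than the kernel's __le64/__packed definitions, and it is not
part of the patch.

/*
 * Userspace sketch of the scoutfs_net_extent_list size math using
 * stand-in type names.  It shows why a full list is roughly 1k and
 * what the server's length check enforces.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct net_extent {
	uint64_t start;
	uint64_t len;
};

struct net_extent_list {
	uint64_t nr;
	struct net_extent extents[];
};

/* mirrors SCOUTFS_NET_EXTENT_LIST_BYTES(): header plus nr extents */
#define NET_EXTENT_LIST_BYTES(nr) \
	offsetof(struct net_extent_list, extents[nr])

#define NET_EXTENT_LIST_MAX_NR 64

int main(void)
{
	/* 8 byte count + 64 * 16 byte extents = 1032 bytes, roughly 1k */
	printf("empty list: %zu bytes\n", NET_EXTENT_LIST_BYTES(0));
	printf("full list:  %zu bytes\n",
	       NET_EXTENT_LIST_BYTES(NET_EXTENT_LIST_MAX_NR));

	/*
	 * The server requires data_len to land exactly on extents[nr],
	 * so a request can't claim more extents than it carries bytes
	 * for, and can't pad itself with trailing garbage.
	 */
	printf("valid data_len for nr=3: %zu bytes\n",
	       NET_EXTENT_LIST_BYTES(3));
	return 0;
}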
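
The transaction reservation follows the same limit: one batch can delete up to
SCOUTFS_NET_EXTENT_LIST_MAX_NR free extents and re-create one leftover, and
each extent is tracked by both a _BLKNO_ and a _BLOCKS_ item, so
SIC_RETURN_EXTENTS() reserves (64 + 1) * 2 = 130 items.  The sketch below is
again userspace with made-up constants (the real NODE_FREE_HIGH_WATER_BLOCKS
derives from MAX_EXTENT_BLOCKS, which isn't visible in this diff); it walks
the same batching policy as the worker: fill a list of at most 64 extents,
clamp the last one so the node keeps its high water reserve, and go around
again until the surplus is gone, which is what the requeue at the end of the
worker achieves.

/*
 * Userspace sketch of the return-extents batching policy with
 * hypothetical numbers.  Each "batch" models one run of the work
 * item: at most LIST_MAX_NR extents, clamped so the node keeps
 * HIGH_WATER free blocks for itself.
 */
#include <stdint.h>
#include <stdio.h>

#define LIST_MAX_NR	64		/* SCOUTFS_NET_EXTENT_LIST_MAX_NR */
#define HIGH_WATER	32768ULL	/* stand-in for NODE_FREE_HIGH_WATER_BLOCKS */
#define EXTENT_LEN	512ULL		/* pretend every free extent is this long */

int main(void)
{
	uint64_t free_blocks = 10 * HIGH_WATER;	/* pretend surplus after a big truncate */
	unsigned int batch = 0;

	while (free_blocks > HIGH_WATER) {
		uint64_t nr = 0;

		/* one work invocation fills one extent list */
		while (nr < LIST_MAX_NR && free_blocks > HIGH_WATER) {
			uint64_t len = EXTENT_LEN;

			/* clamp the final extent at the high water mark */
			if (len > free_blocks - HIGH_WATER)
				len = free_blocks - HIGH_WATER;

			free_blocks -= len;
			nr++;
		}

		printf("batch %u: sent %llu extents, %llu free blocks left\n",
		       ++batch, (unsigned long long)nr,
		       (unsigned long long)free_blocks);
	}

	return 0;
}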