diff --git a/kmod/src/client.c b/kmod/src/client.c index dd0d6b9a..1ad2de21 100644 --- a/kmod/src/client.c +++ b/kmod/src/client.c @@ -556,6 +556,25 @@ int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count, return ret; } +int scoutfs_client_alloc_extent(struct super_block *sb, u64 len, u64 *start) +{ + struct client_info *client = SCOUTFS_SB(sb)->client_info; + __le64 lelen = cpu_to_le64(len); + __le64 lestart; + int ret; + + ret = client_request(client, SCOUTFS_NET_ALLOC_EXTENT, + &lelen, sizeof(lelen), &lestart, sizeof(lestart)); + if (ret == 0) { + if (lestart == 0) + ret = -ENOSPC; + else + *start = le64_to_cpu(lestart); + } + + return ret; +} + int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno) { struct client_info *client = SCOUTFS_SB(sb)->client_info; diff --git a/kmod/src/client.h b/kmod/src/client.h index c2bc3bb9..aa098f02 100644 --- a/kmod/src/client.h +++ b/kmod/src/client.h @@ -3,6 +3,7 @@ int scoutfs_client_alloc_inodes(struct super_block *sb, u64 count, u64 *ino, u64 *nr); +int scoutfs_client_alloc_extent(struct super_block *sb, u64 len, u64 *start); int scoutfs_client_alloc_segno(struct super_block *sb, u64 *segno); int scoutfs_client_record_segment(struct super_block *sb, struct scoutfs_segment *seg, u8 level); diff --git a/kmod/src/compact.c b/kmod/src/compact.c index 4047e5e7..7ae99183 100644 --- a/kmod/src/compact.c +++ b/kmod/src/compact.c @@ -494,7 +494,7 @@ void scoutfs_compact_add_segno(struct super_block *sb, void *data, u64 segno) /* * Commit the result of a compaction based on the state of the cursor. - * The net caller stops the manifest from being written while we're + * The server caller stops the manifest from being written while we're * making changes. We lock the manifest to atomically make our changes. 
* * The erorr handling is sketchy here because calling the manifest from @@ -513,7 +513,7 @@ int scoutfs_compact_commit(struct super_block *sb, void *c, void *r) /* free unused segnos that were allocated for the compaction */ for (i = 0; i < curs->nr_segnos; i++) { if (curs->segnos[i]) { - ret = scoutfs_alloc_free(sb, curs->segnos[i]); + ret = scoutfs_server_free_segno(sb, curs->segnos[i]); BUG_ON(ret); } } @@ -523,7 +523,7 @@ int scoutfs_compact_commit(struct super_block *sb, void *c, void *r) /* delete input segments, probably freeing their segnos */ list_for_each_entry(cseg, &curs->csegs, entry) { if (!cseg->part_of_move) { - ret = scoutfs_alloc_free(sb, cseg->segno); + ret = scoutfs_server_free_segno(sb, cseg->segno); BUG_ON(ret); } diff --git a/kmod/src/count.h b/kmod/src/count.h index 863a789c..95aed312 100644 --- a/kmod/src/count.h +++ b/kmod/src/count.h @@ -211,8 +211,7 @@ static inline const struct scoutfs_item_count SIC_XATTR_SET(unsigned old_parts, static inline const struct scoutfs_item_count SIC_WRITE_BEGIN(void) { struct scoutfs_item_count cnt = {0,}; - unsigned nr_free = (SCOUTFS_BULK_ALLOC_COUNT + - SCOUTFS_BLOCKS_PER_PAGE) * 3; + unsigned nr_free = (1 + SCOUTFS_BLOCKS_PER_PAGE) * 3; unsigned nr_file = (DIV_ROUND_UP(SCOUTFS_BLOCKS_PER_PAGE, 2) + SCOUTFS_BLOCKS_PER_PAGE) * 3; diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 9d6cfc04..b07b5e46 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -101,6 +101,13 @@ EXPAND_COUNTER(seg_free) \ EXPAND_COUNTER(seg_shrink) \ EXPAND_COUNTER(seg_stale_read) \ + EXPAND_COUNTER(server_alloc_segno) \ + EXPAND_COUNTER(server_extent_alloc) \ + EXPAND_COUNTER(server_extent_alloc_error) \ + EXPAND_COUNTER(server_free_extent) \ + EXPAND_COUNTER(server_free_pending_extent) \ + EXPAND_COUNTER(server_free_pending_error) \ + EXPAND_COUNTER(server_free_segno) \ EXPAND_COUNTER(trans_commit_fsync) \ EXPAND_COUNTER(trans_commit_full) \ EXPAND_COUNTER(trans_commit_item_flush) \ diff --git a/kmod/src/data.c b/kmod/src/data.c index e0d273df..12a39425 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -456,38 +456,24 @@ static struct task_cursor *get_cursor(struct data_info *datinf) return curs; } -static int bulk_alloc(struct super_block *sb) +static int get_server_extent(struct super_block *sb, u64 len) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_extent ext; - u64 *segnos = NULL; - int ret = 0; - int i; + u64 start; + int ret; - segnos = scoutfs_client_bulk_alloc(sb); - if (IS_ERR(segnos)) { - ret = PTR_ERR(segnos); + ret = scoutfs_client_alloc_extent(sb, len, &start); + if (ret) goto out; - } - for (i = 0; segnos[i]; i++) { - scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, - sbi->node_id, - segnos[i] << SCOUTFS_SEGMENT_BLOCK_SHIFT, - SCOUTFS_SEGMENT_BLOCKS, 0, 0); - trace_scoutfs_data_bulk_alloc(sb, &ext); - ret = scoutfs_extent_add(sb, data_extent_io, &ext, - sbi->node_id_lock); - if (ret) - break; - } + scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, + sbi->node_id, start, len, 0, 0); + trace_scoutfs_data_get_server_extent(sb, &ext); + ret = scoutfs_extent_add(sb, data_extent_io, &ext, sbi->node_id_lock); + /* XXX don't free extent on error, crash recovery with server */ out: - if (!IS_ERR_OR_NULL(segnos)) - kfree(segnos); - - /* XXX don't orphan segnos on error, crash recovery with server */ - return ret; } @@ -500,8 +486,10 @@ out: * that track large extents. Each new allocating task will get a new * extent. 
*/ -/* XXX initially tied to segment size, should be a lot larger */ -#define LARGE_EXTENT_BLOCKS SCOUTFS_SEGMENT_BLOCKS +#define CURSOR_BLOCKS (1 * 1024 * 1024 / BLOCK_SIZE) +#define CURSOR_BLOCKS_MASK (CURSOR_BLOCKS - 1) +#define CURSOR_BLOCKS_SEARCH (CURSOR_BLOCKS + CURSOR_BLOCKS - 1) +#define CURSOR_BLOCKS_ALLOC (CURSOR_BLOCKS * 64) static int find_alloc_block(struct super_block *sb, struct inode *inode, u64 iblock, bool was_offline, struct scoutfs_lock *lock) @@ -543,16 +531,26 @@ } /* try to find a new large extent, possibly asking for more */ - while (curs->blkno == 0) { + if (curs->blkno == 0) { scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE, - sbi->node_id, 0, 2 * LARGE_EXTENT_BLOCKS, + sbi->node_id, 0, CURSOR_BLOCKS_SEARCH, 0, 0); ret = scoutfs_extent_next(sb, data_extent_io, &ext, sbi->node_id_lock); - if (ret && ret != -ENOENT) + if (ret == -ENOENT) { + /* try to get allocation from the server if we're out */ + ret = get_server_extent(sb, CURSOR_BLOCKS_ALLOC); + if (ret == 0) + ret = scoutfs_extent_next(sb, data_extent_io, + &ext, + sbi->node_id_lock); + } + if (ret) { + /* XXX should try to look for smaller free extents :/ */ + if (ret == -ENOENT) + ret = -ENOSPC; goto out; - - /* XXX should try to look for smaller free extents :/ */ + } /* * set our cursor to the aligned start of a large extent @@ -561,19 +559,10 @@ * constantly setting cursors to the start of a large * free extent that keeps have its start allocated. */ - if (ret == 0) { - trace_scoutfs_data_alloc_block_free(sb, &ext); - curs->blkno = ALIGN(ext.start, LARGE_EXTENT_BLOCKS); - break; - } - - /* try to get allocation from the server if we're out */ - ret = bulk_alloc(sb); - if (ret < 0) - goto out; + trace_scoutfs_data_alloc_block_free(sb, &ext); + curs->blkno = ALIGN(ext.start, CURSOR_BLOCKS); } - /* remove the free block we're using */ scoutfs_extent_init(&fr, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, sbi->node_id, curs->blkno, 1, 0, 0); @@ -603,9 +592,9 @@ scoutfs_inode_add_onoff(inode, 1, was_offline ? -1ULL : 0); /* set cursor to next block, clearing if we finish a large extent */ - BUILD_BUG_ON(!is_power_of_2(LARGE_EXTENT_BLOCKS)); + BUILD_BUG_ON(!is_power_of_2(CURSOR_BLOCKS)); curs->blkno++; - if ((curs->blkno & (LARGE_EXTENT_BLOCKS - 1)) == 0) + if ((curs->blkno & CURSOR_BLOCKS_MASK) == 0) curs->blkno = 0; ret = 0; diff --git a/kmod/src/format.h b/kmod/src/format.h index 03bcd856..f0a28347 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -236,6 +236,19 @@ struct scoutfs_manifest_btree_val { struct scoutfs_key last_key; } __packed; +/* + * Free extents are stored in the server in an allocation btree. The + * type differentiates whether start or length is stored in the major + * value, which is the primary sort key. 'start' is set to the final + * block in the extent so that overlapping queries can be done with + * next instead of prev.
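+ *
+ * For example, a free extent covering blocks 1000 through 1007 (start
+ * 1000, length 8) is indexed by two items: the BLKNO_TYPE key stores
+ * major 1007 (the final block) and minor 8, while the mirrored
+ * BLOCKS_TYPE key stores major 8 and minor 1007.  A query for the
+ * extent overlapping a given block iterates with _next from a BLKNO
+ * key built at that block; the first item found either covers the
+ * block or lies entirely beyond it.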
+ */ +struct scoutfs_extent_btree_key { + __u8 type; + __be64 major; + __be64 minor; +} __packed; + #define SCOUTFS_ALLOC_REGION_SHIFT 8 #define SCOUTFS_ALLOC_REGION_BITS (1 << SCOUTFS_ALLOC_REGION_SHIFT) #define SCOUTFS_ALLOC_REGION_MASK (SCOUTFS_ALLOC_REGION_BITS - 1) @@ -303,7 +316,7 @@ struct scoutfs_segment_block { #define SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE 2 #define SCOUTFS_INODE_INDEX_NR 3 /* don't forget to update */ -/* node zone */ +/* node zone (also used in server alloc btree) */ #define SCOUTFS_FREE_EXTENT_BLKNO_TYPE 1 #define SCOUTFS_FREE_EXTENT_BLOCKS_TYPE 2 @@ -370,6 +383,9 @@ struct scoutfs_super_block { __le64 alloc_uninit; __le64 total_segs; __le64 free_segs; + __le64 total_blocks; + __le64 free_blocks; + __le64 alloc_cursor; struct scoutfs_btree_ring bring; __le64 next_seg_seq; struct scoutfs_btree_root alloc_root; @@ -564,9 +580,9 @@ struct scoutfs_net_segnos { } __packed; struct scoutfs_net_statfs { - __le64 total_segs; /* total segments in device */ + __le64 total_blocks; /* total blocks in device */ __le64 next_ino; /* next unused inode number */ - __le64 bfree; /* total free small blocks */ + __le64 bfree; /* free blocks */ __u8 uuid[SCOUTFS_UUID_BYTES]; /* logical volume uuid */ } __packed; @@ -582,6 +598,7 @@ struct scoutfs_net_statfs { enum { SCOUTFS_NET_ALLOC_INODES = 0, + SCOUTFS_NET_ALLOC_EXTENT, SCOUTFS_NET_ALLOC_SEGNO, SCOUTFS_NET_RECORD_SEGMENT, SCOUTFS_NET_BULK_ALLOC, diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 7929bb54..2c435714 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2117,7 +2117,7 @@ DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_truncate_offline, TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), TP_ARGS(sb, ext) ); -DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_bulk_alloc, +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_get_server_extent, TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), TP_ARGS(sb, ext) ); @@ -2145,6 +2145,30 @@ DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_fiemap_extent, TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), TP_ARGS(sb, ext) ); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_alloc_extent_next, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_alloc_extent_allocated, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_alloc_segno_next, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_alloc_segno_allocated, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_free_pending_extent, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_server_extent_io, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); TRACE_EVENT(scoutfs_online_offline_blocks, TP_PROTO(struct inode *inode, s64 on_delta, s64 off_delta, @@ -2173,6 +2197,33 @@ TRACE_EVENT(scoutfs_online_offline_blocks, __entry->on_now, __entry->off_now) ); +DECLARE_EVENT_CLASS(scoutfs_segno_class, + TP_PROTO(struct super_block *sb, u64 segno), + + TP_ARGS(sb, segno), + + TP_STRUCT__entry( + __field(__u64, fsid) + __field(__s64, segno) + ), + + TP_fast_assign( + __entry->fsid = FSID_ARG(sb); + __entry->segno = segno; + ), + 
+ TP_printk("fsid "FSID_FMT" segno %llu", + __entry->fsid, __entry->segno) +); +DEFINE_EVENT(scoutfs_segno_class, scoutfs_alloc_segno, + TP_PROTO(struct super_block *sb, u64 segno), + TP_ARGS(sb, segno) +); +DEFINE_EVENT(scoutfs_segno_class, scoutfs_free_segno, + TP_PROTO(struct super_block *sb, u64 segno), + TP_ARGS(sb, segno) +); + #endif /* _TRACE_SCOUTFS_H */ /* This part must be outside protection */ diff --git a/kmod/src/seg.c b/kmod/src/seg.c index fea5ea4f..3b402f93 100644 --- a/kmod/src/seg.c +++ b/kmod/src/seg.c @@ -28,6 +28,7 @@ #include "counters.h" #include "triggers.h" #include "msg.h" +#include "server.h" #include "scoutfs_trace.h" /* @@ -298,7 +299,7 @@ out: */ int scoutfs_seg_free_segno(struct super_block *sb, struct scoutfs_segment *seg) { - return scoutfs_alloc_free(sb, seg->segno); + return scoutfs_server_free_segno(sb, seg->segno); } /* diff --git a/kmod/src/server.c b/kmod/src/server.c index d5317d04..64c862c1 100644 --- a/kmod/src/server.c +++ b/kmod/src/server.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "format.h" #include "counters.h" @@ -67,6 +68,10 @@ struct server_info { /* server tracks seq use */ spinlock_t seq_lock; struct list_head pending_seqs; + + /* server tracks pending frees to be applied during commit */ + struct rw_semaphore alloc_rwsem; + struct list_head pending_frees; }; struct server_request { @@ -93,6 +98,350 @@ struct commit_waiter { int ret; }; +static void init_extent_btree_key(struct scoutfs_extent_btree_key *ebk, + u8 type, u64 major, u64 minor) +{ + ebk->type = type; + ebk->major = cpu_to_be64(major); + ebk->minor = cpu_to_be64(minor); +} + +static int init_extent_from_btree_key(struct scoutfs_extent *ext, u8 type, + struct scoutfs_extent_btree_key *ebk, + unsigned int key_bytes) +{ + u64 start; + u64 len; + + /* btree _next doesn't have last key limit */ + if (ebk->type != type) + return -ENOENT; + + if (key_bytes != sizeof(struct scoutfs_extent_btree_key) || + (ebk->type != SCOUTFS_FREE_EXTENT_BLKNO_TYPE && + ebk->type != SCOUTFS_FREE_EXTENT_BLOCKS_TYPE)) + return -EIO; /* XXX corruption, bad key */ + + start = be64_to_cpu(ebk->major); + len = be64_to_cpu(ebk->minor); + if (ebk->type == SCOUTFS_FREE_EXTENT_BLOCKS_TYPE) + swap(start, len); + start -= len - 1; + + return scoutfs_extent_init(ext, ebk->type, 0, start, len, 0, 0); +} + +/* + * This is called by the extent core on behalf of the server who holds + * the appropriate locks to protect the many btree items that can be + * accessed on behalf of one extent operation. + */ +static int server_extent_io(struct super_block *sb, int op, + struct scoutfs_extent *ext, void *data) +{ + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_extent_btree_key ebk; + SCOUTFS_BTREE_ITEM_REF(iref); + bool mirror = false; + u8 mirror_type; + u8 mirror_op = 0; + int ret; + int err; + + trace_scoutfs_server_extent_io(sb, ext); + + if (WARN_ON_ONCE(ext->type != SCOUTFS_FREE_EXTENT_BLKNO_TYPE && + ext->type != SCOUTFS_FREE_EXTENT_BLOCKS_TYPE)) + return -EINVAL; + + if (ext->type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE && + (op == SEI_INSERT || op == SEI_DELETE)) { + mirror = true; + mirror_type = SCOUTFS_FREE_EXTENT_BLOCKS_TYPE; + mirror_op = op == SEI_INSERT ? 
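+			/* the inverse op undoes the blkno item if the mirrored blocks op fails */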
SEI_DELETE : SEI_INSERT; + } + + init_extent_btree_key(&ebk, ext->type, ext->start + ext->len - 1, + ext->len); + if (ext->type == SCOUTFS_FREE_EXTENT_BLOCKS_TYPE) + swap(ebk.major, ebk.minor); + + if (op == SEI_NEXT) { + ret = scoutfs_btree_next(sb, &super->alloc_root, + &ebk, sizeof(ebk), &iref); + if (ret == 0) { + ret = init_extent_from_btree_key(ext, ext->type, + iref.key, + iref.key_len); + scoutfs_btree_put_iref(&iref); + } + + } else if (op == SEI_INSERT) { + ret = scoutfs_btree_insert(sb, &super->alloc_root, + &ebk, sizeof(ebk), NULL, 0); + + } else if (op == SEI_DELETE) { + ret = scoutfs_btree_delete(sb, &super->alloc_root, + &ebk, sizeof(ebk)); + + } else { + ret = WARN_ON_ONCE(-EINVAL); + } + + if (ret == 0 && mirror) { + swap(ext->type, mirror_type); + ret = server_extent_io(sb, op, ext, data); + swap(ext->type, mirror_type); + if (ret) { + err = server_extent_io(sb, mirror_op, ext, data); + BUG_ON(err); + } + } + + return ret; +} + +/* + * Allocate an extent of the given length in the first smallest free + * extent that contains it. We allocate in multiples of segment blocks + * and expose that to callers today. + * + * This doesn't have the cursor that segment allocation does. It's + * possible that a recently freed segment can merge to form a larger + * free extent that can be very quickly allocated to a node. The hope is + * that doesn't happen very often. + */ +static int alloc_extent(struct super_block *sb, u64 len, u64 *start) +{ + struct server_info *server = SCOUTFS_SB(sb)->server_info; + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_extent ext; + int ret; + + *start = 0; + + down_write(&server->alloc_rwsem); + + if (len & (SCOUTFS_SEGMENT_BLOCKS - 1)) { + ret = -EINVAL; + goto out; + } + + scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE, 0, + 0, len, 0, 0); + ret = scoutfs_extent_next(sb, server_extent_io, &ext, NULL); + if (ret) { + if (ret == -ENOENT) + ret = -ENOSPC; + goto out; + } + + trace_scoutfs_server_alloc_extent_next(sb, &ext); + + ext.type = SCOUTFS_FREE_EXTENT_BLKNO_TYPE; + ext.len = len; + + ret = scoutfs_extent_remove(sb, server_extent_io, &ext, NULL); + if (ret) + goto out; + + trace_scoutfs_server_alloc_extent_allocated(sb, &ext); + le64_add_cpu(&super->free_blocks, -ext.len); + + *start = ext.start; + ret = 0; + +out: + up_write(&server->alloc_rwsem); + + if (ret) + scoutfs_inc_counter(sb, server_extent_alloc_error); + else + scoutfs_inc_counter(sb, server_extent_alloc); + + return ret; +} + +struct pending_free_extent { + struct list_head head; + u64 start; + u64 len; +}; + +/* + * Now that the transaction's done we can apply all the pending frees. + * The list entries are totally unsorted so this is the first time that + * we can discover corruption from duplicated frees, etc. This can also + * fail on normal transient io or memory errors. + * + * We can't unwind if this fails. The caller can freak out or keep + * trying forever. 
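+ *
+ * These frees were queued by free_extent() during the transaction so
+ * that nothing could reallocate their blocks before the commit that
+ * drops the references to them; see the comment above free_extent()
+ * below.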
+ */ +static int apply_pending_frees(struct super_block *sb) +{ + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct server_info *server = SCOUTFS_SB(sb)->server_info; + struct pending_free_extent *pfe; + struct pending_free_extent *tmp; + struct scoutfs_extent ext; + int ret; + + down_write(&server->alloc_rwsem); + + list_for_each_entry_safe(pfe, tmp, &server->pending_frees, head) { + scoutfs_inc_counter(sb, server_free_pending_extent); + scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, 0, + pfe->start, pfe->len, 0, 0); + trace_scoutfs_server_free_pending_extent(sb, &ext); + ret = scoutfs_extent_add(sb, server_extent_io, &ext, NULL); + if (ret) { + scoutfs_inc_counter(sb, server_free_pending_error); + break; + } + + le64_add_cpu(&super->free_blocks, pfe->len); + list_del_init(&pfe->head); + kfree(pfe); + } + + up_write(&server->alloc_rwsem); + + return 0; +} + +/* + * If there are still pending frees to destroy it means the server didn't + * shut down cleanly and that's not well supported today so we want to + * have it holler if this happens. In the future we'd cleanly support + * forced shutdown that had been told that it's OK to throw away dirty + * state. + */ +static int destroy_pending_frees(struct super_block *sb) +{ + struct server_info *server = SCOUTFS_SB(sb)->server_info; + struct pending_free_extent *pfe; + struct pending_free_extent *tmp; + + WARN_ON_ONCE(!list_empty(&server->pending_frees)); + + down_write(&server->alloc_rwsem); + + list_for_each_entry_safe(pfe, tmp, &server->pending_frees, head) { + list_del_init(&pfe->head); + kfree(pfe); + } + + up_write(&server->alloc_rwsem); + + return 0; +} + +/* + * We can't satisfy allocations with freed extents until the removed + * references to the freed extents have been committed. We add freed + * extents to a list that is only applied to the persistent indexes as + * the transaction is being committed and the current transaction won't + * try to allocate any more extents. If we didn't do this then we could + * write to referenced data as part of the commit that frees it. If the + * commit was interrupted the stable data could have been overwritten. + */ +static int free_extent(struct super_block *sb, u64 start, u64 len) +{ + struct server_info *server = SCOUTFS_SB(sb)->server_info; + struct pending_free_extent *pfe; + int ret; + + scoutfs_inc_counter(sb, server_free_extent); + + down_write(&server->alloc_rwsem); + + pfe = kmalloc(sizeof(struct pending_free_extent), GFP_NOFS); + if (!pfe) { + ret = -ENOMEM; + } else { + pfe->start = start; + pfe->len = len; + list_add_tail(&pfe->head, &server->pending_frees); + ret = 0; + } + + up_write(&server->alloc_rwsem); + + return ret; +} + +/* + * This is called by the compaction code which is running in the server. + * The server caller has held all the locks, etc. + */ +int scoutfs_server_free_segno(struct super_block *sb, u64 segno) +{ + scoutfs_inc_counter(sb, server_free_segno); + trace_scoutfs_free_segno(sb, segno); + return free_extent(sb, segno << SCOUTFS_SEGMENT_BLOCK_SHIFT, + SCOUTFS_SEGMENT_BLOCKS); +} + +/* + * Allocate a segment on behalf of compaction or a node wanting to write + * a level 0 segment. It has to be aligned to the segment size because + * we address segments with aligned segment numbers instead of block + * offsets. + * + * We can use a simple cursor sweep of the index by start because all + * server extents are multiples of the segment size. 
Sweeping through + * the volume tries to spread out new segment writes and make it more + * rare to write to a recently freed segment which can cause a client to + * have to re-read the manifest. + */ +static int alloc_segno(struct super_block *sb, u64 *segno) +{ + struct server_info *server = SCOUTFS_SB(sb)->server_info; + struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; + struct scoutfs_extent ext; + u64 curs; + int ret; + + down_write(&server->alloc_rwsem); + + curs = ALIGN(le64_to_cpu(super->alloc_cursor), SCOUTFS_SEGMENT_BLOCKS); + *segno = 0; + + do { + scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, 0, + curs, 1, 0, 0); + ret = scoutfs_extent_next(sb, server_extent_io, &ext, NULL); + } while (ret == -ENOENT && curs && (curs = 0, 1)); + if (ret) { + if (ret == -ENOENT) + ret = -ENOSPC; + goto out; + } + + trace_scoutfs_server_alloc_segno_next(sb, &ext); + + /* use cursor if within extent, otherwise start of next extent */ + if (ext.start < curs) + ext.start = curs; + ext.len = SCOUTFS_SEGMENT_BLOCKS; + + ret = scoutfs_extent_remove(sb, server_extent_io, &ext, NULL); + if (ret) + goto out; + + super->alloc_cursor = cpu_to_le64(ext.start + ext.len); + + *segno = ext.start >> SCOUTFS_SEGMENT_BLOCK_SHIFT; + + trace_scoutfs_server_alloc_segno_allocated(sb, &ext); + trace_scoutfs_alloc_segno(sb, *segno); + scoutfs_inc_counter(sb, server_alloc_segno); + +out: + up_write(&server->alloc_rwsem); + return ret; +} + /* * Trigger a server shutdown by shutting down the listening socket. The * server thread will break out of accept and exit. @@ -188,9 +537,9 @@ static void scoutfs_server_commit_func(struct work_struct *work) goto out; } - ret = scoutfs_alloc_apply_pending(sb); + ret = apply_pending_frees(sb); if (ret) { - scoutfs_err(sb, "server error freeing segments: %d", ret); + scoutfs_err(sb, "server error freeing extents: %d", ret); goto out; } @@ -336,6 +685,50 @@ out: return send_reply(conn, id, type, ret, &ial, sizeof(ial)); } +/* + * Give the client an extent allocation of len blocks. We leave the + * details to the extent allocator. + */ +static int process_alloc_extent(struct server_connection *conn, + u64 id, u8 type, void *data, unsigned data_len) +{ + struct server_info *server = conn->server; + struct super_block *sb = server->sb; + struct commit_waiter cw; + __le64 lestart; + __le64 lelen; + u64 start; + int ret; + + if (data_len != sizeof(lelen)) { + ret = -EINVAL; + goto out; + } + + memcpy(&lelen, data, data_len); + + down_read(&server->commit_rwsem); + ret = alloc_extent(sb, le64_to_cpu(lelen), &start); + if (ret == -ENOSPC) { + start = 0; + ret = 0; + } + if (ret == 0) { + lestart = cpu_to_le64(start); + queue_commit_work(server, &cw); + } + up_read(&server->commit_rwsem); + + if (ret == 0) + ret = wait_for_commit(server, &cw, id, type); +out: + return send_reply(conn, id, type, ret, &lestart, sizeof(lestart)); +} + +/* + * We still special case segno allocation because it's aligned and we'd + * like to keep that detail in the server. 
+ */ static int process_alloc_segno(struct server_connection *conn, u64 id, u8 type, void *data, unsigned data_len) { @@ -352,7 +745,7 @@ static int process_alloc_segno(struct server_connection *conn, } down_read(&server->commit_rwsem); - ret = scoutfs_alloc_segno(sb, &segno); + ret = alloc_segno(sb, &segno); if (ret == 0) { lesegno = cpu_to_le64(segno); queue_commit_work(server, &cw); @@ -607,14 +1000,15 @@ static int process_statfs(struct server_connection *conn, u64 id, u8 type, if (data_len == 0) { /* uuid and total_segs are constant, so far */ memcpy(nstatfs.uuid, super->uuid, sizeof(nstatfs.uuid)); - nstatfs.total_segs = super->total_segs; spin_lock(&sbi->next_ino_lock); nstatfs.next_ino = super->next_ino; spin_unlock(&sbi->next_ino_lock); - /* alloc locks the bfree calculation */ - nstatfs.bfree = cpu_to_le64(scoutfs_alloc_bfree(sb)); + down_read(&server->alloc_rwsem); + nstatfs.total_blocks = super->total_blocks; + nstatfs.bfree = super->free_blocks; + up_read(&server->alloc_rwsem); ret = 0; } else { ret = -EINVAL; @@ -657,7 +1051,7 @@ int scoutfs_client_get_compaction(struct super_block *sb, void *curs) /* allow for expansion slop from sticky and alignment */ for (i = 0; i < nr + SCOUTFS_COMPACTION_SLOP; i++) { - ret = scoutfs_alloc_segno(sb, &segno); + ret = alloc_segno(sb, &segno); if (ret < 0) break; scoutfs_compact_add_segno(sb, curs, segno); @@ -728,6 +1122,7 @@ static void scoutfs_server_process_func(struct work_struct *work) struct server_connection *conn = req->conn; static process_func_t process_funcs[] = { [SCOUTFS_NET_ALLOC_INODES] = process_alloc_inodes, + [SCOUTFS_NET_ALLOC_EXTENT] = process_alloc_extent, [SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno, [SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment, [SCOUTFS_NET_BULK_ALLOC] = process_bulk_alloc, @@ -994,7 +1389,6 @@ static void scoutfs_server_func(struct work_struct *work) /* finally start up the server subsystems before accepting */ ret = scoutfs_btree_setup(sb) ?: scoutfs_manifest_setup(sb) ?: - scoutfs_alloc_setup(sb) ?: scoutfs_compact_setup(sb); if (ret) goto shutdown; @@ -1067,7 +1461,7 @@ shutdown: /* shut down all the server subsystems */ scoutfs_compact_destroy(sb); - scoutfs_alloc_destroy(sb); + destroy_pending_frees(sb); scoutfs_manifest_destroy(sb); scoutfs_btree_destroy(sb); @@ -1108,6 +1502,8 @@ int scoutfs_server_setup(struct super_block *sb) seqcount_init(&server->stable_seqcount); spin_lock_init(&server->seq_lock); INIT_LIST_HEAD(&server->pending_seqs); + init_rwsem(&server->alloc_rwsem); + INIT_LIST_HEAD(&server->pending_frees); server->wq = alloc_workqueue("scoutfs_server", WQ_NON_REENTRANT, 0); if (!server->wq) { diff --git a/kmod/src/server.h b/kmod/src/server.h index f6e076db..bbc73901 100644 --- a/kmod/src/server.h +++ b/kmod/src/server.h @@ -9,6 +9,7 @@ void scoutfs_init_ment_from_net(struct scoutfs_manifest_entry *ment, int scoutfs_client_get_compaction(struct super_block *sb, void *curs); int scoutfs_client_finish_compaction(struct super_block *sb, void *curs, void *list); +int scoutfs_server_free_segno(struct super_block *sb, u64 segno); int scoutfs_server_setup(struct super_block *sb); void scoutfs_server_destroy(struct super_block *sb); diff --git a/kmod/src/super.c b/kmod/src/super.c index 915e33fe..8b728d22 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -74,8 +74,7 @@ static int scoutfs_statfs(struct dentry *dentry, struct kstatfs *kst) kst->f_bfree = le64_to_cpu(nstatfs.bfree); kst->f_type = SCOUTFS_SUPER_MAGIC; kst->f_bsize = SCOUTFS_BLOCK_SIZE; - kst->f_blocks = 
le64_to_cpu(nstatfs.total_segs) * - SCOUTFS_SEGMENT_BLOCKS; + kst->f_blocks = le64_to_cpu(nstatfs.total_blocks); kst->f_bavail = kst->f_bfree; kst->f_ffree = kst->f_bfree * 16;
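For reference, a minimal userspace sketch (not part of the patch) of the extent btree key encoding introduced in format.h above. It mirrors init_extent_btree_key() and init_extent_from_btree_key() from server.c, but the struct name, constants, encode/decode helpers, and host-order integers here are simplified stand-ins: the kernel stores major/minor as __be64 so that the btree's bytewise key comparison matches numeric order.

/*
 * Standalone sketch of the dual free-extent key encoding: the primary
 * item sorts by final block, the mirrored item sorts by length.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FREE_EXTENT_BLKNO_TYPE	1	/* sorted by final block, then length */
#define FREE_EXTENT_BLOCKS_TYPE	2	/* sorted by length, then final block */

struct ext_key {
	uint8_t type;
	uint64_t major;
	uint64_t minor;
};

/* mirrors init_extent_btree_key(): major holds the extent's final block */
static struct ext_key encode(uint8_t type, uint64_t start, uint64_t len)
{
	struct ext_key k = {
		.type = type,
		.major = start + len - 1,
		.minor = len,
	};

	if (type == FREE_EXTENT_BLOCKS_TYPE) {
		uint64_t tmp = k.major;

		k.major = k.minor;
		k.minor = tmp;
	}
	return k;
}

/* mirrors init_extent_from_btree_key(): recover start and len from a key */
static void decode(const struct ext_key *k, uint64_t *start, uint64_t *len)
{
	uint64_t final = k->major;
	uint64_t count = k->minor;

	if (k->type == FREE_EXTENT_BLOCKS_TYPE) {
		uint64_t tmp = final;

		final = count;
		count = tmp;
	}
	*len = count;
	*start = final - (count - 1);
}

int main(void)
{
	uint64_t start, len;
	struct ext_key k;

	/* a free extent covering blocks 1000..1007 */
	k = encode(FREE_EXTENT_BLKNO_TYPE, 1000, 8);
	assert(k.major == 1007 && k.minor == 8);

	decode(&k, &start, &len);
	assert(start == 1000 && len == 8);

	/* the length-sorted mirror item used for best-fit allocation */
	k = encode(FREE_EXTENT_BLOCKS_TYPE, 1000, 8);
	assert(k.major == 8 && k.minor == 1007);

	printf("round trip ok\n");
	return 0;
}

Keying the primary index on the final block is what lets alloc_segno() and the overlap checks find a covering or following extent with a single _next call instead of a _prev.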