diff --git a/kmod/src/count.h b/kmod/src/count.h
index 759c736d..799c2e31 100644
--- a/kmod/src/count.h
+++ b/kmod/src/count.h
@@ -201,19 +201,25 @@ static inline const struct scoutfs_item_count SIC_XATTR_SET(unsigned old_parts,
 }
 
 /*
- * write_begin can add local free segment items, modify another to
- * alloc, add a free blkno item, and modify dirty the mapping.
+ * write_begin may have to allocate all the blocks in the page, and may
+ * have to add a large allocation from the server to do so:
+ *  - merge the free extents added by the server
+ *  - remove a free extent per block
+ *  - remove an offline extent for every other block
+ *  - add a file extent per block
  */
 static inline const struct scoutfs_item_count SIC_WRITE_BEGIN(void)
 {
 	struct scoutfs_item_count cnt = {0,};
-	unsigned nr_free = SCOUTFS_BULK_ALLOC_COUNT + 1 + 1;
+	unsigned nr_free = (SCOUTFS_BULK_ALLOC_COUNT +
+			    SCOUTFS_BLOCKS_PER_PAGE) * 3;
+	unsigned nr_file = (DIV_ROUND_UP(SCOUTFS_BLOCKS_PER_PAGE, 2) +
+			    SCOUTFS_BLOCKS_PER_PAGE) * 3;
 
 	__count_dirty_inode(&cnt);
 
-	cnt.items += 1 + nr_free;
-	cnt.vals += SCOUTFS_BLOCK_MAPPING_MAX_BYTES +
-		    (nr_free * sizeof(struct scoutfs_free_bits));
+	cnt.items += nr_free + nr_file;
+	cnt.vals += nr_file * sizeof(struct scoutfs_file_extent);
 
 	return cnt;
 }
@@ -235,4 +241,24 @@ static inline const struct scoutfs_item_count SIC_TRUNC_BLOCK(void)
 	return cnt;
 }
 
+/*
+ * Truncating an extent can:
+ *  - delete an existing file extent,
+ *  - create two surrounding file extents,
+ *  - add an offline file extent,
+ *  - delete two existing free extents, and
+ *  - create a merged free extent.
+ */
+static inline const struct scoutfs_item_count SIC_TRUNC_EXTENT(void)
+{
+	struct scoutfs_item_count cnt = {0,};
+	unsigned int nr_file = 1 + 2 + 1;
+	unsigned int nr_free = (2 + 1) * 2;
+
+	cnt.items += nr_file + nr_free;
+	cnt.vals += nr_file * sizeof(struct scoutfs_file_extent);
+
+	return cnt;
+}
+
 #endif
diff --git a/kmod/src/data.c b/kmod/src/data.c
index fb4d19bb..4343a90d 100644
--- a/kmod/src/data.c
+++ b/kmod/src/data.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include "format.h"
 #include "super.h"
@@ -34,18 +35,11 @@
 #include "client.h"
 #include "lock.h"
 #include "file.h"
+#include "extents.h"
 
 /*
- * scoutfs uses block mapping items at a fixed granularity to describe
- * file data block allocations.
- *
- * Each item describes a fixed number of blocks. To keep the overhead
- * of the items down the series of mapped blocks is encoded. The
- * mapping items also describe offline blocks. They can only be written
- * to newly allocated blocks with the staging ioctl.
- *
- * Free segnos and blocks are kept in bitmap items that are private to
- * nodes so they can be modified without cluster locks.
+ * scoutfs uses extent items to track file data block mappings and free
+ * blocks.
  *
  * Block allocation maintains a fixed number of allocation cursors that
  * remember the position of tasks within free regions. This is very
@@ -90,6 +84,7 @@ struct task_cursor {
 	pid_t pid;
 };
 
+#if 0
 /*
  * Block mapping items and their native decoded form can be pretty big.
  * Let's allocate them to avoid blowing the stack.
@@ -569,6 +564,255 @@ out: for (i = iblock & SCOUTFS_BLOCK_MAPPING_MASK; \ i < SCOUTFS_BLOCK_MAPPING_BLOCKS && iblock <= (last); \ i++, iblock++) +#endif + +static void init_file_extent_key(struct scoutfs_key *key, u64 ino, u64 last) +{ + *key = (struct scoutfs_key) { + .sk_zone = SCOUTFS_FS_ZONE, + .skfe_ino = cpu_to_le64(ino), + .sk_type = SCOUTFS_FILE_EXTENT_TYPE, + .skfe_last = cpu_to_le64(last), + }; +} + +static void init_free_extent_key(struct scoutfs_key *key, u8 type, u64 node_id, + u64 major, u64 minor) +{ + *key = (struct scoutfs_key) { + .sk_zone = SCOUTFS_NODE_ZONE, + .sknf_node_id = cpu_to_le64(node_id), + .sk_type = type, + .sknf_major = cpu_to_le64(major), + .sknf_minor = cpu_to_le64(minor), + }; +} + +static int init_extent_from_item(struct scoutfs_extent *ext, + struct scoutfs_key *key, + struct scoutfs_file_extent *fex) +{ + u64 owner; + u64 start; + u64 map; + u64 len; + u8 flags; + + if (key->sk_type != SCOUTFS_FILE_EXTENT_TYPE && + key->sk_type != SCOUTFS_FREE_EXTENT_BLKNO_TYPE && + key->sk_type != SCOUTFS_FREE_EXTENT_BLOCKS_TYPE) + return -EIO; /* XXX corruption, unknown key type */ + + if (key->sk_type == SCOUTFS_FILE_EXTENT_TYPE) { + owner = le64_to_cpu(key->skfe_ino); + len = le64_to_cpu(fex->len); + start = le64_to_cpu(key->skfe_last) - len + 1; + map = le64_to_cpu(fex->blkno); + flags = fex->flags; + + } else { + owner = le64_to_cpu(key->sknf_node_id); + start = le64_to_cpu(key->sknf_major); + len = le64_to_cpu(key->sknf_minor); + if (key->sk_type == SCOUTFS_FREE_EXTENT_BLOCKS_TYPE) + swap(start, len); + start -= len - 1; + map = 0; + flags = 0; + } + + return scoutfs_extent_init(ext, key->sk_type, owner, start, len, map, + flags); +} + +/* + * Read and write file extent and free extent items. + * + * File extents and free extents are indexed by the last position in the + * extent so that we can find intersections with _next. + * + * We also index free extents by their length. We implement that by + * keeping their _BLOCKS_ item in sync with the primary _BLKNO_ item + * that callers operate on. + */ +static int data_extent_io(struct super_block *sb, int op, + struct scoutfs_extent *ext, void *data) +{ + struct scoutfs_lock *lock = data; + struct scoutfs_file_extent fex; + struct scoutfs_key last; + struct scoutfs_key key; + struct kvec val; + bool mirror = false; + u8 mirror_type; + u8 mirror_op = 0; + int expected; + int ret; + int err; + + if (WARN_ON_ONCE(ext->type != SCOUTFS_FILE_EXTENT_TYPE && + ext->type != SCOUTFS_FREE_EXTENT_BLKNO_TYPE && + ext->type != SCOUTFS_FREE_EXTENT_BLOCKS_TYPE)) + return -EINVAL; + + if (ext->type == SCOUTFS_FREE_EXTENT_BLKNO_TYPE && + (op == SEI_INSERT || op == SEI_DELETE)) { + mirror = true; + mirror_type = SCOUTFS_FREE_EXTENT_BLOCKS_TYPE; + mirror_op = op == SEI_INSERT ? 
+			SEI_DELETE : SEI_INSERT;
+	}
+
+	if (ext->type == SCOUTFS_FILE_EXTENT_TYPE) {
+		init_file_extent_key(&key, ext->owner,
+				     ext->start + ext->len - 1);
+		init_file_extent_key(&last, ext->owner, U64_MAX);
+		fex.blkno = cpu_to_le64(ext->map);
+		fex.len = cpu_to_le64(ext->len);
+		fex.flags = ext->flags;
+		kvec_init(&val, &fex, sizeof(fex));
+	} else {
+		init_free_extent_key(&key, ext->type, ext->owner,
+				     ext->start + ext->len - 1, ext->len);
+		if (ext->type == SCOUTFS_FREE_EXTENT_BLOCKS_TYPE)
+			swap(key.sknf_major, key.sknf_minor);
+		init_free_extent_key(&last, ext->type, ext->owner,
+				     U64_MAX, U64_MAX);
+		kvec_init(&val, NULL, 0);
+	}
+
+	if (op == SEI_NEXT) {
+		expected = val.iov_len;
+		ret = scoutfs_item_next(sb, &key, &last, &val, lock);
+		if (ret >= 0 && ret != expected)
+			ret = -EIO;
+		if (ret == expected)
+			ret = init_extent_from_item(ext, &key, &fex);
+
+	} else if (op == SEI_INSERT) {
+		ret = scoutfs_item_create(sb, &key, &val, lock);
+
+	} else if (op == SEI_DELETE) {
+		ret = scoutfs_item_delete(sb, &key, lock);
+
+	} else {
+		ret = WARN_ON_ONCE(-EINVAL);
+	}
+
+	if (ret == 0 && mirror) {
+		swap(ext->type, mirror_type);
+		ret = data_extent_io(sb, op, ext, data);
+		swap(ext->type, mirror_type);
+		if (ret) {
+			err = data_extent_io(sb, mirror_op, ext, data);
+			BUG_ON(err);
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Find and remove or mark offline the next extent that intersects with
+ * the caller's range. The caller is responsible for transactions and
+ * locks.
+ *
+ * Returns:
+ *  - -errno on errors
+ *  - 0 when there are no more intersecting extents, to stop iteration
+ *  - the iblock of the next logical block to continue truncating from
+ *
+ * Since our extents are block granular we can never have iblock values
+ * > S64_MAX, so a positive return can't be confused with an error.
+ */
+static s64 truncate_one_extent(struct super_block *sb, struct inode *inode,
+			       u64 ino, u64 iblock, u64 last, bool offline,
+			       struct scoutfs_lock *lock)
+{
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
+	struct scoutfs_extent next;
+	struct scoutfs_extent rem;
+	struct scoutfs_extent fr;
+	struct scoutfs_extent ofl;
+	bool rem_fr = false;
+	bool add_rem = false;
+	s64 ret;
+	int err;
+
+	scoutfs_extent_init(&next, SCOUTFS_FILE_EXTENT_TYPE, ino,
+			    iblock, 1, 0, 0);
+	ret = scoutfs_extent_next(sb, data_extent_io, &next, lock);
+	if (ret < 0) {
+		if (ret == -ENOENT)
+			ret = 0;
+		goto out;
+	}
+
+	trace_scoutfs_data_truncate_next(sb, &next);
+
+	scoutfs_extent_init(&rem, SCOUTFS_FILE_EXTENT_TYPE, ino,
+			    iblock, last - iblock + 1, 0, 0);
+	if (!scoutfs_extent_intersection(&rem, &next)) {
+		ret = 0;
+		goto out;
+	}
+
+	trace_scoutfs_data_truncate_remove(sb, &rem);
+
+	/* nothing to do if the extent's already offline */
+	if (offline && (rem.flags & SEF_OFFLINE)) {
+		ret = 1;
+		goto out;
+	}
+
+	/* free an allocated mapping */
+	if (rem.map) {
+		scoutfs_extent_init(&fr, SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
+				    sbi->node_id, rem.map, rem.len, 0, 0);
+		ret = scoutfs_extent_add(sb, data_extent_io, &fr,
+					 sbi->node_id_lock);
+		if (ret)
+			goto out;
+		rem_fr = true;
+	}
+
+	/* remove the mapping */
+	ret = scoutfs_extent_remove(sb, data_extent_io, &rem, lock);
+	if (ret)
+		goto out;
+	add_rem = true;
+
+	/* add an offline extent */
+	if (offline) {
+		scoutfs_extent_init(&ofl, SCOUTFS_FILE_EXTENT_TYPE, rem.owner,
+				    rem.start, rem.len, 0, SEF_OFFLINE);
+		trace_scoutfs_data_truncate_offline(sb, &ofl);
+		ret = scoutfs_extent_add(sb, data_extent_io, &ofl, lock);
+		if (ret)
+			goto out;
+	}
+
+	scoutfs_inode_add_onoff(inode, rem.map ?
-rem.len : 0, + (rem.flags & SEF_OFFLINE ? -rem.len : 0) + + (offline ? ofl.len : 0)); + ret = 1; +out: + if (ret < 0) { + err = 0; + if (add_rem) + err |= scoutfs_extent_add(sb, data_extent_io, &rem, + lock); + if (rem_fr) + err |= scoutfs_extent_remove(sb, data_extent_io, &fr, + sbi->node_id_lock); + BUG_ON(err); /* inconsistency, could save/restore */ + + } else if (ret > 0) { + ret = rem.start + rem.len; + } + + return ret; +} /* * Free blocks inside the logical block range from 'iblock' to 'last', @@ -591,125 +835,37 @@ int scoutfs_data_truncate_items(struct super_block *sb, struct inode *inode, struct scoutfs_lock *lock) { DECLARE_DATA_INFO(sb, datinf); - struct scoutfs_key last_key; - struct scoutfs_key key; - struct block_mapping *map; - struct kvec val; - bool holding = false; - bool dirtied; - u64 blkno; - int bytes; - int ret = 0; - int i; + s64 ret = 0; + + WARN_ON_ONCE(inode && !mutex_is_locked(&inode->i_mutex)); + + /* clamp last to the last possible block? */ + if (last > SCOUTFS_BLOCK_MAX) + last = SCOUTFS_BLOCK_MAX; trace_scoutfs_data_truncate_items(sb, iblock, last, offline); if (WARN_ON_ONCE(last < iblock)) return -EINVAL; - map = kmalloc(sizeof(struct block_mapping), GFP_NOFS); - if (!map) - return -ENOMEM; - - init_mapping_key(&last_key, ino, last); - while (iblock <= last) { - /* find the mapping that could include iblock */ - init_mapping_key(&key, ino, iblock); - kvec_init(&val, map->encoded, sizeof(map->encoded)); - - ret = scoutfs_hold_trans(sb, SIC_TRUNC_BLOCK()); + ret = scoutfs_hold_trans(sb, SIC_TRUNC_EXTENT()); if (ret) break; - holding = true; down_write(&datinf->alloc_rwsem); - - ret = scoutfs_item_next(sb, &key, &last_key, &val, lock); - if (ret < 0) { - if (ret == -ENOENT) - ret = 0; - break; - } - - ret = decode_mapping(map, ret); - if (ret < 0) - break; - - /* set iblock to the first in the next item inside last */ - iblock = max(iblock, le64_to_cpu(key.skm_base) << - SCOUTFS_BLOCK_MAPPING_SHIFT); - - dirtied = false; - for_each_block(i, iblock, last) { - - blkno = map->blknos[i]; - - /* don't need to do anything.. 
*/ - if (!blkno && - !(!offline && test_bit(i, map->offline))) - continue; - - if (!dirtied) { - /* dirty item with full size encoded */ - ret = scoutfs_item_update(sb, &key, &val, lock); - if (ret) - break; - dirtied = true; - } - - /* truncating offline block */ - if (!offline && test_bit(i, map->offline)) { - clear_bit(i, map->offline); - scoutfs_inode_add_offline_blocks(inode, -1); - inode->i_blocks -= SCOUTFS_BLOCK_SECTORS; - } - - /* nothing more to do if unallocated */ - if (!blkno) - continue; - - /* free the allocated block, maybe marking offline */ - ret = set_blkno_free(sb, blkno); - if (ret) - break; - - map->blknos[i] = 0; - scoutfs_inode_add_online_blocks(inode, -1); - inode->i_blocks -= SCOUTFS_BLOCK_SECTORS; - - if (offline) { - set_bit(i, map->offline); - scoutfs_inode_add_offline_blocks(inode, 1); - inode->i_blocks += SCOUTFS_BLOCK_SECTORS; - } - } - - if (dirtied) { - /* update how ever much of the item we finished */ - bytes = encode_mapping(map); - if (bytes) { - kvec_init(&val, map->encoded, bytes); - scoutfs_item_update_dirty(sb, &key, &val); - } else { - scoutfs_item_delete_dirty(sb, &key); - } - } - + ret = truncate_one_extent(sb, inode, ino, iblock, last, + offline, lock); up_write(&datinf->alloc_rwsem); scoutfs_release_trans(sb); - holding = false; - if (ret) + if (ret <= 0) break; + + iblock = ret; + ret = 0; } - if (holding) { - up_write(&datinf->alloc_rwsem); - scoutfs_release_trans(sb); - } - - kfree(map); return ret; } @@ -785,6 +941,8 @@ static struct task_cursor *get_cursor(struct data_info *datinf) static int bulk_alloc(struct super_block *sb) { + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_extent ext; u64 *segnos = NULL; int ret = 0; int i; @@ -796,7 +954,13 @@ static int bulk_alloc(struct super_block *sb) } for (i = 0; segnos[i]; i++) { - ret = set_segno_free(sb, segnos[i]); + scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, + sbi->node_id, + segnos[i] << SCOUTFS_SEGMENT_BLOCK_SHIFT, + SCOUTFS_SEGMENT_BLOCKS, 0, 0); + trace_scoutfs_data_bulk_alloc(sb, &ext); + ret = scoutfs_extent_add(sb, data_extent_io, &ext, + sbi->node_id_lock); if (ret) break; } @@ -810,6 +974,7 @@ out: return ret; } +#if 0 /* * Find the free bit item that contains the blkno and return the next blkno * set starting with this blkno. @@ -883,27 +1048,33 @@ static int find_free_segno(struct super_block *sb, u64 *segno) out: return ret; } +#endif /* * Allocate a single block for the logical block offset in the file. + * The caller tells us if the block was offline or not. We modify the + * extent items and the caller will search for the resulting extent. * * We try to encourage contiguous allocation by having per-task cursors - * that track blocks inside segments. Each new allocating task will get - * a new segment. Lots of concurrent allocations can interleave at - * segment granularity. + * that track large extents. Each new allocating task will get a new + * extent. 
  */
+/* XXX initially tied to segment size, should be a lot larger */
+#define LARGE_EXTENT_BLOCKS SCOUTFS_SEGMENT_BLOCKS
 static int find_alloc_block(struct super_block *sb, struct inode *inode,
-			    struct block_mapping *map,
-			    struct scoutfs_key *map_key,
-			    unsigned map_ind, bool map_exists,
-			    struct scoutfs_lock *data_lock)
+			    u64 iblock, bool was_offline,
+			    struct scoutfs_lock *lock)
 {
+	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	DECLARE_DATA_INFO(sb, datinf);
+	const u64 ino = scoutfs_ino(inode);
+	struct scoutfs_extent ext;
+	struct scoutfs_extent ofl;
+	struct scoutfs_extent fr;
 	struct task_cursor *curs;
-	struct kvec val;
-	int bytes;
-	u64 segno;
-	u64 blkno;
+	bool add_ofl = false;
+	bool add_fr = false;
+	int err;
 	int ret;
 
 	down_write(&datinf->alloc_rwsem);
@@ -912,74 +1083,103 @@ static int find_alloc_block(struct super_block *sb, struct inode *inode,
 
 	trace_scoutfs_data_find_alloc_block_curs(sb, curs, curs->blkno);
 
-	/* try to find the next blkno in our cursor if we have one */
+	/* see if our cursor is still free */
 	if (curs->blkno) {
-		ret = find_free_blkno(sb, curs->blkno, &blkno);
-		if (ret < 0 && ret != -ENOENT)
+		/* look for the free extent that contains our cursor's blkno */
+		scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLKNO_TYPE,
+				    sbi->node_id, curs->blkno, 1, 0, 0);
+		ret = scoutfs_extent_next(sb, data_extent_io, &ext,
+					  sbi->node_id_lock);
+		if (ret && ret != -ENOENT)
 			goto out;
-		if (ret == 0) {
-			curs->blkno = blkno;
-			segno = 0;
-		} else {
+
+		if (ret == 0)
+			trace_scoutfs_data_alloc_block_cursor(sb, &ext);
+
+		/* find a new large extent if our cursor isn't free */
+		if (ret < 0 || ext.start > curs->blkno)
 			curs->blkno = 0;
-		}
 	}
 
-	/* try to find segnos, asking the server for more */
+	/* try to find a new large extent, possibly asking for more */
 	while (curs->blkno == 0) {
-		ret = find_free_segno(sb, &segno);
-		if (ret < 0 && ret != -ENOENT)
+		scoutfs_extent_init(&ext, SCOUTFS_FREE_EXTENT_BLOCKS_TYPE,
+				    sbi->node_id, 0, 2 * LARGE_EXTENT_BLOCKS,
+				    0, 0);
+		ret = scoutfs_extent_next(sb, data_extent_io, &ext,
+					  sbi->node_id_lock);
+		if (ret && ret != -ENOENT)
			goto out;
+
+		/* XXX should try to look for smaller free extents :/ */
+
+		/*
+		 * Set our cursor to the aligned start of a large extent.
+		 * We'll then remove it and the next aligned free large
+		 * extent will start much later. This stops us from
+		 * constantly setting cursors to the start of a large
+		 * free extent that keeps having its start allocated.
+ */ if (ret == 0) { - blkno = segno << SCOUTFS_SEGMENT_BLOCK_SHIFT; - curs->blkno = blkno; + trace_scoutfs_data_alloc_block_free(sb, &ext); + curs->blkno = ALIGN(ext.start, LARGE_EXTENT_BLOCKS); break; } + /* try to get allocation from the server if we're out */ ret = bulk_alloc(sb); if (ret < 0) goto out; } - trace_scoutfs_data_find_alloc_block_found_seg(sb, segno, blkno); - /* ensure that we can copy in encoded without failing */ - kvec_init(&val, map->encoded, sizeof(map->encoded)); - if (map_exists) - ret = scoutfs_item_update(sb, map_key, &val, data_lock); - else - ret = scoutfs_item_create(sb, map_key, &val, data_lock); + /* remove the free block we're using */ + scoutfs_extent_init(&fr, SCOUTFS_FREE_EXTENT_BLKNO_TYPE, + sbi->node_id, curs->blkno, 1, 0, 0); + ret = scoutfs_extent_remove(sb, data_extent_io, &fr, sbi->node_id_lock); if (ret) goto out; + add_fr = true; - /* clear the free bit we found */ - if (segno) - ret = clear_segno_free(sb, segno); - else - ret = clear_blkno_free(sb, blkno); - if (ret) - goto out; - - /* update the mapping */ - if (test_and_clear_bit(map_ind, map->offline)) { - scoutfs_inode_add_offline_blocks(inode, -1); - inode->i_blocks -= SCOUTFS_BLOCK_SECTORS; + /* remove an offline file extent */ + if (was_offline) { + scoutfs_extent_init(&ofl, SCOUTFS_FILE_EXTENT_TYPE, ino, + iblock, 1, 0, SEF_OFFLINE); + ret = scoutfs_extent_remove(sb, data_extent_io, &ofl, lock); + if (ret) + goto out; + add_ofl = true; } - map->blknos[map_ind] = blkno; - scoutfs_inode_add_online_blocks(inode, 1); - inode->i_blocks += SCOUTFS_BLOCK_SECTORS; - bytes = encode_mapping(map); - kvec_init(&val, map->encoded, bytes); - scoutfs_item_update_dirty(sb, map_key, &val); + /* add (and hopefully merge!) the new allocation */ + scoutfs_extent_init(&ext, SCOUTFS_FILE_EXTENT_TYPE, ino, + iblock, 1, curs->blkno, 0); + trace_scoutfs_data_alloc_block(sb, &ext); + ret = scoutfs_extent_add(sb, data_extent_io, &ext, lock); + if (ret) + goto out; - /* set cursor to next block, clearing if we finish the segment */ + scoutfs_inode_add_onoff(inode, 1, was_offline ? 
-1ULL : 0); + + /* set cursor to next block, clearing if we finish a large extent */ + BUILD_BUG_ON(!is_power_of_2(LARGE_EXTENT_BLOCKS)); curs->blkno++; - if ((curs->blkno & SCOUTFS_FREE_BITS_MASK) == 0) + if ((curs->blkno & (LARGE_EXTENT_BLOCKS - 1)) == 0) curs->blkno = 0; ret = 0; out: + if (ret) { + err = 0; + if (add_ofl) + err |= scoutfs_extent_add(sb, data_extent_io, &ofl, + lock); + if (add_fr) + err |= scoutfs_extent_add(sb, data_extent_io, &fr, + sbi->node_id_lock); + BUG_ON(err); /* inconsistency */ + } + up_write(&datinf->alloc_rwsem); trace_scoutfs_data_find_alloc_block_ret(sb, ret); @@ -991,80 +1191,68 @@ static int scoutfs_get_block(struct inode *inode, sector_t iblock, { struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; - struct scoutfs_key key; + struct scoutfs_extent ext; struct scoutfs_lock *lock; - struct block_mapping *map; - struct kvec val; - bool exists; - int ind; + u64 offset; int ret; - int i; + + WARN_ON_ONCE(create && !mutex_is_locked(&inode->i_mutex)); lock = scoutfs_per_task_get(&si->pt_data_lock); if (WARN_ON_ONCE(!lock)) return -EINVAL; - map = kmalloc(sizeof(struct block_mapping), GFP_NOFS); - if (!map) - return -ENOMEM; +restart: + /* look for the extent that overlaps our iblock */ + scoutfs_extent_init(&ext, SCOUTFS_FILE_EXTENT_TYPE, + scoutfs_ino(inode), iblock, 1, 0, 0); + ret = scoutfs_extent_next(sb, data_extent_io, &ext, lock); + if (ret && ret != -ENOENT) + goto out; - init_mapping_key(&key, scoutfs_ino(inode), iblock); - kvec_init(&val, map->encoded, sizeof(map->encoded)); + if (ret == 0) + trace_scoutfs_data_get_block_next(sb, &ext); - /* find the mapping item that covers the logical block */ - ret = scoutfs_item_lookup(sb, &key, &val, lock); - if (ret < 0) { - if (ret != -ENOENT) - goto out; - memset(map->blknos, 0, sizeof(map->blknos)); - memset(map->offline, 0, sizeof(map->offline)); - exists = false; - } else { - ret = decode_mapping(map, ret); - if (ret < 0) - goto out; - exists = true; - } + /* didn't find an extent or it's past our iblock */ + if (ret == -ENOENT || ext.start > iblock) + memset(&ext, 0, sizeof(ext)); - ind = iblock & SCOUTFS_BLOCK_MAPPING_MASK; + if (ext.len) + trace_scoutfs_data_get_block_intersection(sb, &ext); /* fail read and write if it's offline and we're not staging */ - if (test_bit(ind, map->offline) && !si->staging) { + if ((ext.flags & SEF_OFFLINE) && !si->staging) { ret = -EINVAL; goto out; } /* try to allocate if we're writing */ - if (create && !map->blknos[ind]) { + if (create && !ext.map) { /* * XXX can blow the transaction here.. need to back off * and try again if we've already done a bulk alloc in * our transaction. 
*/ - ret = find_alloc_block(sb, inode, map, &key, ind, exists, lock); + ret = find_alloc_block(sb, inode, iblock, + ext.flags & SEF_OFFLINE, lock); if (ret) goto out; set_buffer_new(bh); + /* restart the search now that it's been allocated */ + goto restart; } - /* mark the bh mapped and set the size for as many contig as we see */ - if (map->blknos[ind]) { - for (i = 1; ind + i < SCOUTFS_BLOCK_MAPPING_BLOCKS; i++) { - if (map->blknos[ind + i] != map->blknos[ind] + i) - break; - } - - map_bh(bh, inode->i_sb, map->blknos[ind]); - bh->b_size = min_t(u64, bh->b_size, i << SCOUTFS_BLOCK_SHIFT); + /* map the bh and set the size to as much of the extent as we can */ + if (ext.map) { + offset = iblock - ext.start; + map_bh(bh, inode->i_sb, ext.map + offset); + bh->b_size = min_t(u64, bh->b_size, + (ext.len - offset) << SCOUTFS_BLOCK_SHIFT); } - ret = 0; out: trace_scoutfs_get_block(sb, scoutfs_ino(inode), iblock, create, ret, bh->b_blocknr, bh->b_size); - - kfree(map); - return ret; } @@ -1231,6 +1419,7 @@ struct pending_fiemap { u32 flags; }; +#if 0 /* * The caller is iterating over mapped blocks. We merge the current * pending fiemap entry with the next block if we can. If we can't @@ -1276,43 +1465,31 @@ static int merge_or_fill(struct fiemap_extent_info *fieinfo, return 0; } +#endif /* - * Iterate over non-zero block mapping items merging contiguous blocks and - * filling extent entries as we cross non-contiguous boundaries. We set - * _LAST on the last extent and _UNKNOWN on offline extents. + * Return all the file's extents whose blocks overlap with the caller's + * byte region. We set _LAST on the last extent and _UNKNOWN on offline + * extents. */ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { struct super_block *sb = inode->i_sb; - const u64 ino = scoutfs_ino(inode); - struct scoutfs_key last_key; - struct scoutfs_key key; struct scoutfs_lock *inode_lock = NULL; - struct block_mapping *map; - struct pending_fiemap pend; - struct kvec val; + struct scoutfs_extent ext; loff_t i_size; - bool offline; u64 blk_off; - u64 final; - u64 logical; - u64 phys; + u64 logical = 0; + u64 phys = 0; + u64 size = 0; + u32 flags = 0; int ret; - int i; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); if (ret) return ret; - map = kmalloc(sizeof(struct block_mapping), GFP_NOFS); - if (!map) - return -ENOMEM; - - /* initialize to impossible to merge */ - memset(&pend, 0, sizeof(pend)); - /* XXX overkill? 
*/ mutex_lock(&inode->i_mutex); @@ -1323,68 +1500,46 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - blk_off = start >> SCOUTFS_BLOCK_SHIFT; - final = min_t(loff_t, i_size - 1, start + len - 1) >> - SCOUTFS_BLOCK_SHIFT; - init_mapping_key(&last_key, ino, final); - ret = scoutfs_lock_inode(sb, DLM_LOCK_PR, 0, inode, &inode_lock); if (ret) goto out; - while (blk_off <= final) { - init_mapping_key(&key, ino, blk_off); - kvec_init(&val, &map->encoded, sizeof(map->encoded)); + blk_off = start >> SCOUTFS_BLOCK_SHIFT; - ret = scoutfs_item_next(sb, &key, &last_key, &val, inode_lock); - if (ret < 0) { + for (;;) { + scoutfs_extent_init(&ext, SCOUTFS_FILE_EXTENT_TYPE, + scoutfs_ino(inode), blk_off, 1, 0, 0); + ret = scoutfs_extent_next(sb, data_extent_io, &ext, inode_lock); + /* fiemap will return last and stop when we see enoent */ + if (ret < 0 && ret != -ENOENT) + break; + + if (ret == 0) + trace_scoutfs_data_fiemap_extent(sb, &ext); + + if (size) { if (ret == -ENOENT) - ret = 0; - break; - } - - ret = decode_mapping(map, ret); - if (ret < 0) - break; - - /* set blk_off to the first in the next item inside last */ - blk_off = max(blk_off, le64_to_cpu(key.skm_base) << - SCOUTFS_BLOCK_MAPPING_SHIFT); - - for_each_block(i, blk_off, final) { - offline = !!test_bit(i, map->offline); - - /* nothing to do with sparse regions */ - if (map->blknos[i] == 0 && !offline) - continue; - - trace_scoutfs_data_fiemap(sb, blk_off, i, - map->blknos[i]); - - logical = blk_off << SCOUTFS_BLOCK_SHIFT; - phys = map->blknos[i] << SCOUTFS_BLOCK_SHIFT; - - ret = merge_or_fill(fieinfo, &pend, logical, phys, - offline, false); - if (ret != 0) + flags |= FIEMAP_EXTENT_LAST; + ret = fiemap_fill_next_extent(fieinfo, logical, phys, + size, flags); + if (ret || (logical + size >= (start + len))) { + if (ret == 1) + ret = 0; break; + } } - if (ret != 0) - break; + + logical = ext.start << SCOUTFS_BLOCK_SHIFT; + phys = ext.map << SCOUTFS_BLOCK_SHIFT; + size = ext.len << SCOUTFS_BLOCK_SHIFT; + flags = (ext.flags & SEF_OFFLINE) ? FIEMAP_EXTENT_UNKNOWN : 0; + + blk_off = ext.start + ext.len; } scoutfs_unlock(sb, inode_lock, DLM_LOCK_PR); - - if (ret == 0) { - /* catch final last fill */ - ret = merge_or_fill(fieinfo, &pend, 0, 0, false, true); - } - if (ret == 1) - ret = 0; - out: mutex_unlock(&inode->i_mutex); - kfree(map); return ret; } @@ -1460,6 +1615,7 @@ void scoutfs_data_destroy(struct super_block *sb) } } +#if 0 /* * Basic correctness tests of u64 and mapping encoding. 
*/ @@ -1576,3 +1732,4 @@ out: return ret; } +#endif diff --git a/kmod/src/data.h b/kmod/src/data.h index e8157065..c9214c5c 100644 --- a/kmod/src/data.h +++ b/kmod/src/data.h @@ -13,6 +13,4 @@ int scoutfs_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int scoutfs_data_setup(struct super_block *sb); void scoutfs_data_destroy(struct super_block *sb); -int __init scoutfs_data_test(void); - #endif diff --git a/kmod/src/format.h b/kmod/src/format.h index 7735440b..766aad25 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -83,6 +83,11 @@ struct scoutfs_key { #define skf_node_id _sk_first #define skf_base _sk_second +/* node free extent */ +#define sknf_node_id _sk_first +#define sknf_major _sk_second +#define sknf_minor _sk_third + /* node orphan inode */ #define sko_node_id _sk_first #define sko_ino _sk_second @@ -109,6 +114,10 @@ struct scoutfs_key { #define skm_ino _sk_first #define skm_base _sk_second +/* file extent */ +#define skfe_ino _sk_first +#define skfe_last _sk_second + /* * The btree still uses memcmp() to compare keys. We should fix that * before too long. @@ -305,6 +314,8 @@ struct scoutfs_segment_block { /* node zone */ #define SCOUTFS_FREE_BITS_SEGNO_TYPE 1 #define SCOUTFS_FREE_BITS_BLKNO_TYPE 2 +#define SCOUTFS_FREE_EXTENT_BLKNO_TYPE 3 +#define SCOUTFS_FREE_EXTENT_BLOCKS_TYPE 4 /* fs zone */ #define SCOUTFS_INODE_TYPE 1 @@ -315,6 +326,7 @@ struct scoutfs_segment_block { #define SCOUTFS_SYMLINK_TYPE 6 #define SCOUTFS_BLOCK_MAPPING_TYPE 7 #define SCOUTFS_ORPHAN_TYPE 8 +#define SCOUTFS_FILE_EXTENT_TYPE 9 #define SCOUTFS_MAX_TYPE 16 /* power of 2 is efficient */ @@ -367,6 +379,18 @@ struct scoutfs_free_bits { __le64 bits[SCOUTFS_FREE_BITS_U64S]; } __packed; +/* + * File extents have more data than easily fits in the key so we move + * the non-indexed fields into the value. + */ +struct scoutfs_file_extent { + __le64 blkno; + __le64 len; + __u8 flags; +} __packed; + +#define SEF_OFFLINE 0x1 + /* * The first xattr part item has a header that describes the xattr. 
The * name and value are then packed into the following bytes in the first @@ -510,7 +534,6 @@ enum { SCOUTFS_DT_WHT, }; -#define SCOUTFS_MAX_VAL_SIZE SCOUTFS_BLOCK_MAPPING_MAX_BYTES #define SCOUTFS_XATTR_MAX_NAME_LEN 255 #define SCOUTFS_XATTR_MAX_VAL_LEN 65535 @@ -520,6 +543,8 @@ enum { DIV_ROUND_UP(sizeof(struct scoutfs_xattr) + name_len + val_len, \ SCOUTFS_XATTR_MAX_PART_SIZE); +#define SCOUTFS_MAX_VAL_SIZE SCOUTFS_XATTR_MAX_PART_SIZE + /* * structures used by dlm */ diff --git a/kmod/src/key.c b/kmod/src/key.c index 23aa8265..6b34b432 100644 --- a/kmod/src/key.c +++ b/kmod/src/key.c @@ -28,6 +28,8 @@ char *scoutfs_type_strings[SCOUTFS_MAX_ZONE][SCOUTFS_MAX_TYPE] = { [SCOUTFS_INODE_INDEX_ZONE][SCOUTFS_INODE_INDEX_DATA_SEQ_TYPE] = "dsq", [SCOUTFS_NODE_ZONE][SCOUTFS_FREE_BITS_SEGNO_TYPE] = "fsg", [SCOUTFS_NODE_ZONE][SCOUTFS_FREE_BITS_BLKNO_TYPE] = "fbk", + [SCOUTFS_NODE_ZONE][SCOUTFS_FREE_EXTENT_BLKNO_TYPE] = "fbn", + [SCOUTFS_NODE_ZONE][SCOUTFS_FREE_EXTENT_BLOCKS_TYPE] = "fbs", [SCOUTFS_NODE_ZONE][SCOUTFS_ORPHAN_TYPE] = "orp", [SCOUTFS_FS_ZONE][SCOUTFS_INODE_TYPE] = "ino", [SCOUTFS_FS_ZONE][SCOUTFS_XATTR_TYPE] = "xat", @@ -36,6 +38,7 @@ char *scoutfs_type_strings[SCOUTFS_MAX_ZONE][SCOUTFS_MAX_TYPE] = { [SCOUTFS_FS_ZONE][SCOUTFS_LINK_BACKREF_TYPE] = "lbr", [SCOUTFS_FS_ZONE][SCOUTFS_SYMLINK_TYPE] = "sym", [SCOUTFS_FS_ZONE][SCOUTFS_BLOCK_MAPPING_TYPE] = "bmp", + [SCOUTFS_FS_ZONE][SCOUTFS_FILE_EXTENT_TYPE] = "fex", }; char scoutfs_unknown_u8_strings[U8_MAX][U8_STR_MAX]; diff --git a/kmod/src/scoutfs_trace.h b/kmod/src/scoutfs_trace.h index 2ab9c3f3..c42b6c00 100644 --- a/kmod/src/scoutfs_trace.h +++ b/kmod/src/scoutfs_trace.h @@ -2131,6 +2131,47 @@ DEFINE_EVENT(scoutfs_extent_class, scoutfs_extent_remove, TP_ARGS(sb, ext) ); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_truncate_next, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_truncate_remove, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_truncate_offline, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_bulk_alloc, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_alloc_block_cursor, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_alloc_block_free, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_alloc_block, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_get_block_next, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_get_block_intersection, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); +DEFINE_EVENT(scoutfs_extent_class, scoutfs_data_fiemap_extent, + TP_PROTO(struct super_block *sb, struct scoutfs_extent *ext), + TP_ARGS(sb, ext) +); + #endif /* _TRACE_SCOUTFS_H */ /* This part must be outside protection */ diff --git a/kmod/src/super.c b/kmod/src/super.c index fdfd4080..915e33fe 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -424,10 +424,6 @@ static int __init scoutfs_module_init(void) 
 	scoutfs_key_init();
 	scoutfs_init_counters();
 
-	ret = scoutfs_data_test();
-	if (ret)
-		return ret;
-
 	ret = scoutfs_sysfs_init();
 	if (ret)
 		return ret;
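
Review aid, not part of the patch: the standalone sketch below models how one free extent is indexed twice, mirroring init_free_extent_key() and the sknf_major/sknf_minor swap in data_extent_io(). The primary _BLKNO_ item is keyed by the extent's last block number (major) with its length in the minor; the mirrored _BLOCKS_ item swaps the two so items sort by length. The struct and constants here are simplified stand-ins for the real scoutfs_key and format.h definitions, not kernel code.

/*
 * Simplified model of the dual indexing of free extents.  The real
 * keys also carry a zone and the owning node_id.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_FREE_EXTENT_BLKNO_TYPE	3	/* keyed by last blkno */
#define DEMO_FREE_EXTENT_BLOCKS_TYPE	4	/* keyed by length */

struct demo_free_key {
	uint8_t  type;
	uint64_t major;
	uint64_t minor;
};

/* build both index keys for the free extent [start, start + len - 1] */
static void demo_free_extent_keys(uint64_t start, uint64_t len,
				  struct demo_free_key *by_blkno,
				  struct demo_free_key *by_len)
{
	uint64_t last = start + len - 1;

	/* primary item: sorted by the extent's last block number */
	by_blkno->type = DEMO_FREE_EXTENT_BLKNO_TYPE;
	by_blkno->major = last;
	by_blkno->minor = len;

	/* mirror item: major/minor swapped so items sort by length */
	by_len->type = DEMO_FREE_EXTENT_BLOCKS_TYPE;
	by_len->major = len;
	by_len->minor = last;
}

int main(void)
{
	struct demo_free_key by_blkno, by_len;

	/* a 256 block free extent starting at block 4096 */
	demo_free_extent_keys(4096, 256, &by_blkno, &by_len);

	printf("_BLKNO_  key: major %llu minor %llu\n",
	       (unsigned long long)by_blkno.major,
	       (unsigned long long)by_blkno.minor);
	printf("_BLOCKS_ key: major %llu minor %llu\n",
	       (unsigned long long)by_len.major,
	       (unsigned long long)by_len.minor);
	return 0;
}

With the length-sorted mirror item kept in sync, find_alloc_block()'s _next search starting from a 2 * LARGE_EXTENT_BLOCKS extent appears to amount to "find the first free extent at least that long", while the truncate and merge paths keep operating on the blkno-sorted primary item.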