From fc50072cf97537d3cb9dd4c3ef8de1397c74fa38 Mon Sep 17 00:00:00 2001
From: Zach Brown
Date: Thu, 29 Jun 2017 14:22:10 -0700
Subject: [PATCH] scoutfs: store manifest entries in the btree

Convert the manifest to store entries in persistent btree keys and
values instead of using the rbtree in memory from the ring.

The btree doesn't have a sort function.  It just compares variable
length keys.  The most complicated part of this transformation is
dealing with the fallout of this.  The compare function can't compare
different search keys and item keys so searches need to construct full
synthetic btree keys to search.  It also can't return different
comparisons, like overlapping, so the caller needs to do a bit more
work to use key comparisons to find overlapping segments.  And it can't
compare differently depending on the level of the manifest so we store
the manifest in keys differently depending on whether it's in level 0
or not.

All mount clients can now see the manifest blocks.  They can query the
manifest directly when trying to find segments to read.  We can get rid
of all the networking calls that were finding the segments for readers.

We change the manifest functions that relied on the ring so that they
make their changes in the persistent manifest btree.

We don't touch the allocator or the rest of the manifest server,
though, so this commit breaks the world.  It'll be restored in future
patches as we update the segment allocator and server to work with the
btree.

Signed-off-by: Zach Brown
---
 kmod/src/btree.c    |   2 +-
 kmod/src/compact.c  |  32 +-
 kmod/src/compact.h  |   4 +-
 kmod/src/format.h   |  39 ++-
 kmod/src/manifest.c | 818 ++++++++++++++++++++------------------------
 kmod/src/manifest.h |  54 ++-
 kmod/src/net.c      | 283 +++++----------
 kmod/src/net.h      |   4 -
 kmod/src/ring.c     |   6 +-
 kmod/src/seg.c      |  41 +--
 kmod/src/seg.h      |  11 +-
 11 files changed, 548 insertions(+), 746 deletions(-)

diff --git a/kmod/src/btree.c b/kmod/src/btree.c
index e30f5af0..55a2a0a8 100644
--- a/kmod/src/btree.c
+++ b/kmod/src/btree.c
@@ -1768,7 +1768,7 @@ int scoutfs_btree_write_dirty(struct super_block *sb)
 	struct scoutfs_super_block *super = &sbi->super;
 	struct scoutfs_btree_ring *bring = &super->bring;
 	struct scoutfs_btree_root *roots[] = {
-		/* XXX super roots go here */
+		&super->manifest.root,
 		NULL,
 	};
 	struct scoutfs_btree_root *root;
diff --git a/kmod/src/compact.c b/kmod/src/compact.c
index 56b248a8..12f8d902 100644
--- a/kmod/src/compact.c
+++ b/kmod/src/compact.c
@@ -457,15 +457,13 @@ void scoutfs_compact_describe(struct super_block *sb, void *data,
  * and is then possibly adding all the lower overlapping segments.
  */
 int scoutfs_compact_add(struct super_block *sb, void *data,
-			struct scoutfs_key_buf *first,
-			struct scoutfs_key_buf *last, u64 segno, u64 seq,
-			u8 level)
+			struct scoutfs_manifest_entry *ment)
 {
 	struct compact_cursor *curs = data;
 	struct compact_seg *cseg;
 	int ret;
 
-	cseg = alloc_cseg(sb, first, last);
+	cseg = alloc_cseg(sb, &ment->first, &ment->last);
 	if (!cseg) {
 		ret = -ENOMEM;
 		goto out;
@@ -473,9 +471,9 @@ int scoutfs_compact_add(struct super_block *sb, void *data,
 
 	list_add_tail(&cseg->entry, &curs->csegs);
 
-	cseg->segno = segno;
-	cseg->seq = seq;
-	cseg->level = level;
+	cseg->segno = ment->segno;
+	cseg->seq = ment->seq;
+	cseg->level = ment->level;
 
 	if (!curs->upper)
 		curs->upper = cseg;
@@ -501,8 +499,8 @@ void scoutfs_compact_add_segno(struct super_block *sb, void *data, u64 segno)
 
 /*
  * Commit the result of a compaction based on the state of the cursor.
- * The net caller stops the rings from being written while we're making
- * changes.  We lock the manifest to atomically make our changes.
+ * The net caller stops the manifest from being written while we're
+ * making changes.  We lock the manifest to atomically make our changes.
  *
  * The erorr handling is sketchy here because calling the manifest from
  * here is temporary.  We should be sending a message to the server
@@ -510,6 +508,7 @@ void scoutfs_compact_add_segno(struct super_block *sb, void *data, u64 segno)
  */
 int scoutfs_compact_commit(struct super_block *sb, void *c, void *r)
 {
+	struct scoutfs_manifest_entry ment;
 	struct compact_cursor *curs = c;
 	struct list_head *results = r;
 	struct compact_seg *cseg;
@@ -533,8 +532,9 @@ int scoutfs_compact_commit(struct super_block *sb, void *c, void *r)
 			BUG_ON(ret);
 		}
 
-		ret = scoutfs_manifest_del(sb, cseg->first,
-					   cseg->seq, cseg->level);
+		scoutfs_manifest_init_entry(&ment, cseg->level, 0, cseg->seq,
+					    cseg->first, NULL);
+		ret = scoutfs_manifest_del(sb, &ment);
 		BUG_ON(ret);
 	}
 
@@ -542,12 +542,12 @@ int scoutfs_compact_commit(struct super_block *sb, void *c, void *r)
 	list_for_each_entry(cseg, results, entry) {
 		/* XXX moved upper segments won't have read the segment :P */
 		if (cseg->seg)
-			ret = scoutfs_seg_manifest_add(sb, cseg->seg,
-						       cseg->level);
+			scoutfs_seg_init_ment(&ment, cseg->level, cseg->seg);
 		else
-			ret = scoutfs_manifest_add(sb, cseg->first,
-						   cseg->last, cseg->segno,
-						   cseg->seq, cseg->level);
+			scoutfs_manifest_init_entry(&ment, cseg->level,
+						    cseg->segno, cseg->seq,
+						    cseg->first, cseg->last);
+		ret = scoutfs_manifest_add(sb, &ment);
 		BUG_ON(ret);
 	}
 
diff --git a/kmod/src/compact.h b/kmod/src/compact.h
index f6f4bb60..c163ce56 100644
--- a/kmod/src/compact.h
+++ b/kmod/src/compact.h
@@ -6,9 +6,7 @@ void scoutfs_compact_kick(struct super_block *sb);
 void scoutfs_compact_describe(struct super_block *sb, void *data,
 			      u8 upper_level, u8 last_level, bool sticky);
 int scoutfs_compact_add(struct super_block *sb, void *data,
-			struct scoutfs_key_buf *first,
-			struct scoutfs_key_buf *last, u64 segno, u64 seq,
-			u8 level);
+			struct scoutfs_manifest_entry *ment);
 void scoutfs_compact_add_segno(struct super_block *sb, void *data, u64 segno);
 int scoutfs_compact_commit(struct super_block *sb, void *c, void *r);
 
diff --git a/kmod/src/format.h b/kmod/src/format.h
index 8a7a2df5..7ca2f2e3 100644
--- a/kmod/src/format.h
+++ b/kmod/src/format.h
@@ -169,16 +169,38 @@ struct scoutfs_btree_ring {
 #define SCOUTFS_MANIFEST_FANOUT 10
 
 struct scoutfs_manifest {
-	struct scoutfs_ring_descriptor ring;
+	struct scoutfs_btree_root root;
 	__le64 level_counts[SCOUTFS_MANIFEST_MAX_LEVEL];
 } __packed;
 
-struct scoutfs_manifest_entry {
+/*
+ * Manifest entries are packed into btree keys and values in a very
+ * fiddly way so that we can sort them with memcmp first by level then
+ * by their position in the level.  First comes the level.
+ *
+ * Level 0 segments are sorted by their seq so they don't have the first
+ * segment key in the manifest btree key.  Both of their keys are in the
+ * value.
+ *
+ * Level 1 segments are sorted by their first key so their last key is
+ * in the value.
+ *
+ * We go to all this trouble so that we can communicate a version of the
+ * manifest with one btree root, have dense btree keys which are used as
+ * separators in parent blocks, and don't duplicate the large keys in
+ * the manifest btree key and value.
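+ *
+ * A sketch of the resulting layouts (illustrative, not normative): a
+ * level 0 entry has the btree key { level = 0, be64 seq } and a value
+ * holding segno, seq, and both segment keys, while a level 2 entry has
+ * the btree key { level = 2, first key bytes } and a value holding
+ * segno, seq, and only the last key.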
+ */ + +struct scoutfs_manifest_btree_key { + __u8 level; + __u8 bkey[0]; +} __packed; + +struct scoutfs_manifest_btree_val { __le64 segno; __le64 seq; __le16 first_key_len; __le16 last_key_len; - __u8 level; __u8 keys[0]; } __packed; @@ -536,9 +558,13 @@ struct scoutfs_net_key_range { __u8 key_bytes[0]; } __packed; -struct scoutfs_net_manifest_entries { - __le16 nr; - struct scoutfs_manifest_entry ments[0]; +struct scoutfs_net_manifest_entry { + __le64 segno; + __le64 seq; + __le16 first_key_len; + __le16 last_key_len; + __u8 level; + __u8 keys[0]; } __packed; /* XXX I dunno, totally made up */ @@ -561,7 +587,6 @@ struct scoutfs_net_segnos { enum { SCOUTFS_NET_ALLOC_INODES = 0, - SCOUTFS_NET_MANIFEST_RANGE_ENTRIES, SCOUTFS_NET_ALLOC_SEGNO, SCOUTFS_NET_RECORD_SEGMENT, SCOUTFS_NET_BULK_ALLOC, diff --git a/kmod/src/manifest.c b/kmod/src/manifest.c index f757c1b9..31c55283 100644 --- a/kmod/src/manifest.c +++ b/kmod/src/manifest.c @@ -20,7 +20,7 @@ #include "kvec.h" #include "seg.h" #include "item.h" -#include "ring.h" +#include "btree.h" #include "cmp.h" #include "compact.h" #include "manifest.h" @@ -30,16 +30,17 @@ #include "scoutfs_trace.h" /* - * Manifest entries are stored in ring nodes. + * Manifest entries are stored in the cow btrees in the persistently + * allocated ring of blocks in the shared device. This lets clients + * read consistent old versions of the manifest when it's safe to do so. * - * They're sorted first by level then by their first key. This enables - * the primary searches based on key value for looking up items in - * segments via the manifest. + * Manifest entries are sorted first by level then by their first key. + * This enables the primary searches based on key value for looking up + * items in segments via the manifest. */ struct manifest { struct rw_semaphore rwsem; - struct scoutfs_ring_info ring; u8 nr_levels; /* calculated on mount, const thereafter */ @@ -78,41 +79,6 @@ struct manifest_ref { struct scoutfs_key_buf *last; }; -/* - * Seq is only specified for operations that differentiate between - * segments with identical items by their sequence number. - */ -struct manifest_search_key { - u64 seq; - struct scoutfs_key_buf *key; - u8 level; -}; - -static void init_ment_keys(struct scoutfs_manifest_entry *ment, - struct scoutfs_key_buf *first, - struct scoutfs_key_buf *last) -{ - if (first) - scoutfs_key_init(first, ment->keys, - le16_to_cpu(ment->first_key_len)); - if (last) - scoutfs_key_init(last, ment->keys + - le16_to_cpu(ment->first_key_len), - le16_to_cpu(ment->last_key_len)); -} - -static bool cmp_range_ment(struct scoutfs_key_buf *key, - struct scoutfs_key_buf *end, - struct scoutfs_manifest_entry *ment) -{ - struct scoutfs_key_buf first; - struct scoutfs_key_buf last; - - init_ment_keys(ment, &first, &last); - - return scoutfs_key_compare_ranges(key, end, &first, &last); -} - /* * Change the level count under the manifest lock. 
We then maintain a * bit that can be tested outside the lock to determine if the caller @@ -153,6 +119,152 @@ bool scoutfs_manifest_level0_full(struct super_block *sb) return test_bit(MANI_FLAG_LEVEL0_FULL, &mani->flags); } +void scoutfs_manifest_init_entry(struct scoutfs_manifest_entry *ment, + u64 level, u64 segno, u64 seq, + struct scoutfs_key_buf *first, + struct scoutfs_key_buf *last) +{ + ment->level = level; + ment->segno = segno; + ment->seq = seq; + + if (first) + scoutfs_key_clone(&ment->first, first); + else + scoutfs_key_init(&ment->first, NULL, 0); + + if (last) + scoutfs_key_clone(&ment->last, last); + else + scoutfs_key_init(&ment->last, NULL, 0); +} + +/* + * level 0 segments have the extra seq up in the btree key. + */ +static struct scoutfs_manifest_btree_key * +alloc_btree_key_val_lens(unsigned first_len, unsigned last_len) +{ + return kmalloc(sizeof(struct scoutfs_manifest_btree_key) + + sizeof(u64) + + sizeof(struct scoutfs_manifest_btree_val) + + first_len + last_len, GFP_NOFS); +} + +/* + * Initialize the btree key and value for a manifest entry in one contiguous + * allocation. + */ +static struct scoutfs_manifest_btree_key * +alloc_btree_key_val(struct scoutfs_manifest_entry *ment, unsigned *mkey_len, + struct scoutfs_manifest_btree_val **mval_ret, + unsigned *mval_len_ret) +{ + struct scoutfs_manifest_btree_key *mkey; + struct scoutfs_manifest_btree_val *mval; + struct scoutfs_key_buf b_first; + struct scoutfs_key_buf b_last; + unsigned bkey_len; + unsigned mval_len; + __be64 seq; + + mkey = alloc_btree_key_val_lens(ment->first.key_len, ment->last.key_len); + if (!mkey) + return NULL; + + if (ment->level == 0) { + seq = cpu_to_be64(ment->seq); + bkey_len = sizeof(seq); + memcpy(mkey->bkey, &seq, bkey_len); + } else { + bkey_len = ment->first.key_len; + } + + *mkey_len = offsetof(struct scoutfs_manifest_btree_key, bkey[bkey_len]); + mval = (void *)mkey + *mkey_len; + + if (ment->level == 0) { + scoutfs_key_init(&b_first, mval->keys, ment->first.key_len); + scoutfs_key_init(&b_last, mval->keys + ment->first.key_len, + ment->last.key_len); + mval_len = sizeof(struct scoutfs_manifest_btree_val) + + ment->first.key_len + ment->last.key_len; + } else { + scoutfs_key_init(&b_first, mkey->bkey, ment->first.key_len); + scoutfs_key_init(&b_last, mval->keys, ment->last.key_len); + mval_len = sizeof(struct scoutfs_manifest_btree_val) + + ment->last.key_len; + } + + mkey->level = ment->level; + mval->segno = cpu_to_le64(ment->segno); + mval->seq = cpu_to_le64(ment->seq); + mval->first_key_len = cpu_to_le16(ment->first.key_len); + mval->last_key_len = cpu_to_le16(ment->last.key_len); + + scoutfs_key_copy(&b_first, &ment->first); + scoutfs_key_copy(&b_last, &ment->last); + + if (mval_ret) { + *mval_ret = mval; + *mval_len_ret = mval_len; + } + return mkey; +} + +/* initialize a native manifest entry to point to the btree key and value */ +static void init_ment_iref(struct scoutfs_manifest_entry *ment, + struct scoutfs_btree_item_ref *iref) +{ + struct scoutfs_manifest_btree_key *mkey = iref->key; + struct scoutfs_manifest_btree_val *mval = iref->val; + + ment->level = mkey->level; + ment->segno = le64_to_cpu(mval->segno); + ment->seq = le64_to_cpu(mval->seq); + + if (ment->level == 0) { + scoutfs_key_init(&ment->first, mval->keys, + le16_to_cpu(mval->first_key_len)); + scoutfs_key_init(&ment->last, mval->keys + + le16_to_cpu(mval->first_key_len), + le16_to_cpu(mval->last_key_len)); + } else { + scoutfs_key_init(&ment->first, mkey->bkey, + le16_to_cpu(mval->first_key_len)); + 
scoutfs_key_init(&ment->last, mval->keys,
+				 le16_to_cpu(mval->last_key_len));
+	}
+}
+
+/*
+ * Fill the caller's max-size btree key with the given values and return
+ * its length.
+ */
+static unsigned init_btree_key(struct scoutfs_manifest_btree_key *mkey,
+			       u8 level, u64 seq, struct scoutfs_key_buf *first)
+{
+	struct scoutfs_key_buf b_first;
+	unsigned bkey_len;
+	__be64 bseq;
+
+	mkey->level = level;
+
+	if (level == 0) {
+		bseq = cpu_to_be64(seq);
+		bkey_len = sizeof(bseq);
+		memcpy(mkey->bkey, &bseq, bkey_len);
+	} else if (first) {
+		scoutfs_key_init(&b_first, mkey->bkey, first->key_len);
+		scoutfs_key_copy(&b_first, first);
+		bkey_len = first->key_len;
+	} else {
+		bkey_len = 0;
+	}
+
+	return offsetof(struct scoutfs_manifest_btree_key, bkey[bkey_len]);
+}
+
 /*
  * Insert a new manifest entry in the ring.  The ring allocates a new
  * node for us and we fill it.
  *
  * This must be called with the manifest lock held.
@@ -160,180 +272,68 @@ bool scoutfs_manifest_level0_full(struct super_block *sb)
  */
 int scoutfs_manifest_add(struct super_block *sb,
-			 struct scoutfs_key_buf *first,
-			 struct scoutfs_key_buf *last, u64 segno, u64 seq,
-			 u8 level)
+			 struct scoutfs_manifest_entry *ment)
 {
 	DECLARE_MANIFEST(sb, mani);
-	struct scoutfs_manifest_entry *ment;
-	struct scoutfs_key_buf ment_first;
-	struct scoutfs_key_buf ment_last;
-	struct manifest_search_key skey;
-	unsigned key_bytes;
-	unsigned bytes;
-
-	trace_scoutfs_manifest_add(sb, level, segno, seq, first, last);
-
-	key_bytes = first->key_len + last->key_len;
-	bytes = offsetof(struct scoutfs_manifest_entry, keys[key_bytes]);
-
-	skey.key = first;
-	skey.level = level;
-	skey.seq = seq;
-
-	ment = scoutfs_ring_insert(&mani->ring, &skey, bytes);
-	if (!ment)
-		return -ENOMEM;
-
-	ment->segno = cpu_to_le64(segno);
-	ment->seq = cpu_to_le64(seq);
-	ment->first_key_len = cpu_to_le16(first->key_len);
-	ment->last_key_len = cpu_to_le16(last->key_len);
-	ment->level = level;
-
-	init_ment_keys(ment, &ment_first, &ment_last);
-	scoutfs_key_copy(&ment_first, first);
-	scoutfs_key_copy(&ment_last, last);
-
-	mani->nr_levels = max_t(u8, mani->nr_levels, level + 1);
-	add_level_count(sb, level, 1);
-	return 0;
-}
-
-/*
- * Add a manifest entry as provided by the caller instead of exploded
- * out into arguments.
- *
- * This must be called with the manifest lock held.
- */ -int scoutfs_manifest_add_ment(struct super_block *sb, - struct scoutfs_manifest_entry *add) -{ - DECLARE_MANIFEST(sb, mani); - struct scoutfs_manifest_entry *ment; - struct manifest_search_key skey; - struct scoutfs_key_buf first; - struct scoutfs_key_buf last; - unsigned bytes; + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + struct scoutfs_manifest_btree_key *mkey; + struct scoutfs_manifest_btree_val *mval; + unsigned mkey_len; + unsigned mval_len; + int ret; lockdep_assert_held(&mani->rwsem); - init_ment_keys(add, &first, &last); - trace_scoutfs_manifest_add(sb, add->level, le64_to_cpu(add->segno), - le64_to_cpu(add->seq), &first, &last); - - skey.key = &first; - skey.level = add->level; - skey.seq = le64_to_cpu(add->seq); - - bytes = scoutfs_manifest_bytes(add); - - ment = scoutfs_ring_insert(&mani->ring, &skey, bytes); - if (!ment) + mkey = alloc_btree_key_val(ment, &mkey_len, &mval, &mval_len); + if (!mkey) return -ENOMEM; - memcpy(ment, add, bytes); + trace_scoutfs_manifest_add(sb, ment->level, ment->segno, ment->seq, + &ment->first, &ment->last); - mani->nr_levels = max_t(u8, mani->nr_levels, add->level + 1); - add_level_count(sb, add->level, 1); + ret = scoutfs_btree_insert(sb, &super->manifest.root, mkey, mkey_len, + mval, mval_len); + if (ret == 0) { + mani->nr_levels = max_t(u8, mani->nr_levels, ment->level + 1); + add_level_count(sb, ment->level, 1); + } - return 0; + kfree(mkey); + return ret; } /* * This must be called with the manifest lock held. + * + * When this is called from the network we can take the keys directly as + * they were sent from the clients. */ -int scoutfs_manifest_dirty(struct super_block *sb, - struct scoutfs_key_buf *first, u64 seq, u8 level) +int scoutfs_manifest_del(struct super_block *sb, + struct scoutfs_manifest_entry *ment) { DECLARE_MANIFEST(sb, mani); - struct scoutfs_manifest_entry *ment; - struct manifest_search_key skey; + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + struct scoutfs_manifest_btree_key *mkey; + unsigned mkey_len; + int ret; - skey.key = first; - skey.level = level; - skey.seq = seq; + trace_scoutfs_manifest_delete(sb, ment->level, ment->segno, ment->seq, + &ment->first, &ment->last); - ment = scoutfs_ring_lookup(&mani->ring, &skey); - if (!ment) - return -ENOENT; + lockdep_assert_held(&mani->rwsem); - scoutfs_ring_dirty(&mani->ring, ment); - return 0; -} + mkey = alloc_btree_key_val(ment, &mkey_len, NULL, NULL); + if (!mkey) + return -ENOMEM; -/* - * This must be called with the manifest lock held. - */ -int scoutfs_manifest_del(struct super_block *sb, struct scoutfs_key_buf *first, - u64 seq, u8 level) -{ - DECLARE_MANIFEST(sb, mani); - struct scoutfs_manifest_entry *ment; - struct manifest_search_key skey; - struct scoutfs_key_buf last; + ret = scoutfs_btree_delete(sb, &super->manifest.root, mkey, mkey_len); + if (ret == 0) + add_level_count(sb, ment->level, -1ULL); - skey.key = first; - skey.level = level; - skey.seq = seq; - - ment = scoutfs_ring_lookup(&mani->ring, &skey); - if (!ment) - return -ENOENT; - - init_ment_keys(ment, NULL, &last); - trace_scoutfs_manifest_delete(sb, ment->level, le64_to_cpu(ment->segno), - le64_to_cpu(ment->seq), first, &last); - - scoutfs_ring_delete(&mani->ring, ment); - add_level_count(sb, level, -1ULL); - return 0; -} - -/* - * Return the total number of bytes used by the given manifest entry, - * including its struct. 
- */
-int scoutfs_manifest_bytes(struct scoutfs_manifest_entry *ment)
-{
-	return sizeof(struct scoutfs_manifest_entry) +
-	       le16_to_cpu(ment->first_key_len) +
-	       le16_to_cpu(ment->last_key_len);
-}
-
-/*
- * Return an allocated and filled in manifest entry.
- */
-struct scoutfs_manifest_entry *
-scoutfs_manifest_alloc_entry(struct super_block *sb,
-			     struct scoutfs_key_buf *first,
-			     struct scoutfs_key_buf *last, u64 segno, u64 seq,
-			     u8 level)
-{
-	struct scoutfs_manifest_entry *ment;
-	struct scoutfs_key_buf ment_first;
-	struct scoutfs_key_buf ment_last;
-	unsigned key_bytes;
-	unsigned bytes;
-
-	key_bytes = first->key_len + last->key_len;
-	bytes = offsetof(struct scoutfs_manifest_entry, keys[key_bytes]);
-
-	ment = kmalloc(bytes, GFP_NOFS);
-	if (!ment)
-		return NULL;
-
-	ment->segno = cpu_to_le64(segno);
-	ment->seq = cpu_to_le64(seq);
-	ment->first_key_len = cpu_to_le16(first->key_len);
-	ment->last_key_len = cpu_to_le16(last->key_len);
-	ment->level = level;
-
-	init_ment_keys(ment, &ment_first, &ment_last);
-	scoutfs_key_copy(&ment_first, first);
-	scoutfs_key_copy(&ment_last, last);
-
-	return ment;
+	kfree(mkey);
+	return ret;
 }
 
 /*
@@ -372,50 +372,70 @@ static void free_ref(struct super_block *sb, struct manifest_ref *ref)
 }
 
 /*
- * Allocate a native manifest ref so that we can work with segments described
- * by the callers manifest entry.
- *
- * This frees all the elements on the list if it returns an error.
+ * Allocate a reading manifest ref so that we can work with segments
+ * described by the caller's manifest entry.
  */
-int scoutfs_manifest_add_ment_ref(struct super_block *sb,
-				  struct list_head *list,
-				  struct scoutfs_manifest_entry *ment)
+static int alloc_manifest_ref(struct super_block *sb, struct list_head *ref_list,
+			      struct scoutfs_manifest_entry *ment)
 {
-	struct scoutfs_key_buf ment_first;
-	struct scoutfs_key_buf ment_last;
 	struct manifest_ref *ref;
-	struct manifest_ref *tmp;
-
-	init_ment_keys(ment, &ment_first, &ment_last);
 
 	ref = kzalloc(sizeof(struct manifest_ref), GFP_NOFS);
 	if (ref) {
-		ref->first = scoutfs_key_dup(sb, &ment_first);
-		ref->last = scoutfs_key_dup(sb, &ment_last);
+		ref->first = scoutfs_key_dup(sb, &ment->first);
+		ref->last = scoutfs_key_dup(sb, &ment->last);
	}
 	if (!ref || !ref->first || !ref->last) {
 		free_ref(sb, ref);
-		list_for_each_entry_safe(ref, tmp, list, entry) {
-			list_del_init(&ref->entry);
-			free_ref(sb, ref);
-		}
 		return -ENOMEM;
 	}
 
-	ref->segno = le64_to_cpu(ment->segno);
-	ref->seq = le64_to_cpu(ment->seq);
 	ref->level = ment->level;
+	ref->segno = ment->segno;
+	ref->seq = ment->seq;
 
-	list_add_tail(&ref->entry, list);
+	list_add_tail(&ref->entry, ref_list);
 
 	return 0;
 }
 
 /*
- * Return an array of pointers to the entries in the manifest that
- * intersect with the given key range.  The entries will be ordered by
- * the order that they should be read: level 0 from newest to oldest
- * then increasing higher order levels.
+ * Return the previous entry if it's in the right level and it overlaps
+ * with the start key by having a last key that's >=.  If no such entry
+ * exists it just returns the next entry after the key and doesn't test
+ * it at all.  If this returns 0 then the caller has to put the iref.
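+ *
+ * For example (hypothetical keys): with level 2 entries covering
+ * [a,f] and [m,r], a search starting at "h" finds the previous entry
+ * [a,f], sees that its last key "f" is below "h", and so returns the
+ * next entry [m,r] for the caller to test instead.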
+ */
+static int btree_prev_overlap_or_next(struct super_block *sb,
+				      struct scoutfs_btree_root *root,
+				      void *key, unsigned key_len,
+				      struct scoutfs_key_buf *start, u8 level,
+				      struct scoutfs_btree_item_ref *iref)
+{
+	struct scoutfs_manifest_entry ment;
+	int ret;
+
+	ret = scoutfs_btree_prev(sb, root, key, key_len, iref);
+	if (ret < 0 && ret != -ENOENT)
+		return ret;
+
+	if (ret == 0) {
+		init_ment_iref(&ment, iref);
+		if (ment.level != level ||
+		    scoutfs_key_compare(&ment.last, start) < 0)
+			ret = -ENOENT;
+	}
+	if (ret == -ENOENT) {
+		scoutfs_btree_put_iref(iref);
+		ret = scoutfs_btree_next(sb, root, key, key_len, iref);
+	}
+
+	return ret;
+}
+
+/*
+ * Get refs on all the manifest entries for segments that contain items
+ * starting with the caller's key.  The entries will be ordered by the
+ * order that they should be read: level 0 from newest to oldest then
+ * increasing higher order levels.
  *
  * We have to get all the level 0 segments that intersect with the range
  * of items that we want to search because the level 0 segments can
@@ -427,74 +447,96 @@ int scoutfs_manifest_add_ment_ref(struct super_block *sb,
  * existing segment that intersects with the range, even if it doesn't
  * contain the key.  The key might fall between segments at that level.
  *
- * This is called by the server who is processing manifest search
- * messages from mounts.  The server locks down the manifest while it
- * gets these pointers and then uses them to allocate and fill a reply
- * message.
+ * XXX Today this is using the roots from the mount-wide super.  This is
+ * super wrong.  Doing so lets it use the dirty btree that could be
+ * modified by the manifest server running on this node so it has to
+ * lock.  It should be using a specific root communicated by lock lvbs
+ * (or read from the super on mount).  Then the btrees it traverses will
+ * be stable and read-only.  (But can still get -ESTALE if they're
+ * re-written under us, would need to re-sample roots from the super in
+ * that case, I imagine.)
*/ -struct scoutfs_manifest_entry ** -scoutfs_manifest_find_range_entries(struct super_block *sb, - struct scoutfs_key_buf *key, - struct scoutfs_key_buf *end, - unsigned *found_bytes) +static int get_manifest_refs(struct super_block *sb, struct scoutfs_key_buf *key, + struct scoutfs_key_buf *end, + struct list_head *ref_list) { DECLARE_MANIFEST(sb, mani); struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; - struct scoutfs_manifest_entry **found; - struct scoutfs_manifest_entry *ment; - struct manifest_search_key skey; - unsigned nr; + struct scoutfs_manifest_btree_key *mkey; + struct scoutfs_manifest_entry ment; + SCOUTFS_BTREE_ITEM_REF(iref); + SCOUTFS_BTREE_ITEM_REF(prev); + unsigned mkey_len; + int ret; int i; - lockdep_assert_held(&mani->rwsem); + scoutfs_manifest_init_entry(&ment, 0, 0, 0, key, NULL); + mkey = alloc_btree_key_val(&ment, &mkey_len, NULL, NULL); + if (!mkey) + return -ENOMEM; - *found_bytes = 0; - - /* at most we get all level 0, one from other levels, and null term */ - nr = le64_to_cpu(super->manifest.level_counts[0]) + mani->nr_levels + 1; - - found = kcalloc(nr, sizeof(struct scoutfs_manifest_entry *), GFP_NOFS); - if (!found) { - found = ERR_PTR(-ENOMEM); - goto out; - } - - nr = 0; + scoutfs_manifest_lock(sb); /* get level 0 segments that overlap with the missing range */ - skey.key = NULL; - skey.level = 0; - skey.seq = ~0ULL; - ment = scoutfs_ring_lookup_prev(&mani->ring, &skey); - while (ment) { - if (cmp_range_ment(key, end, ment) == 0) { - found[nr++] = ment; - *found_bytes += scoutfs_manifest_bytes(ment); + mkey_len = init_btree_key(mkey, 0, ~0ULL, NULL); + ret = scoutfs_btree_prev(sb, &super->manifest.root, + mkey, mkey_len, &iref); + while (ret == 0) { + init_ment_iref(&ment, &iref); + + if (scoutfs_key_compare_ranges(key, end, &ment.first, + &ment.last) == 0) { + ret = alloc_manifest_ref(sb, ref_list, &ment); + if (ret) + goto out; } - ment = scoutfs_ring_prev(&mani->ring, ment); + swap(prev, iref); + ret = scoutfs_btree_before(sb, &super->manifest.root, + prev.key, prev.key_len, &iref); + scoutfs_btree_put_iref(&prev); } + if (ret != -ENOENT) + goto out; - /* get higher level segments that overlap with the starting key */ + /* + * XXX Today we need to read the next segment if our starting key + * falls between segments. That won't be the case once we tie + * cached items to their locks. 
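+	 *
+	 * For example (hypothetical keys): if a level's segments cover
+	 * [a,f] and [m,r] and the caller starts from "h", we still add
+	 * [m,r] so iteration can proceed past the gap between segments.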
+ */ + mkey_len = init_btree_key(mkey, 1, 0, key); for (i = 1; i < mani->nr_levels; i++) { - skey.key = key; - skey.level = i; - skey.seq = 0; + mkey->level = i; /* XXX should use level counts to skip searches */ - ment = scoutfs_ring_lookup_next(&mani->ring, &skey); - if (ment) { - found[nr++] = ment; - *found_bytes += scoutfs_manifest_bytes(ment); + scoutfs_btree_put_iref(&iref); + ret = btree_prev_overlap_or_next(sb, &super->manifest.root, + mkey, mkey_len, key, i, + &iref); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; } - } - /* null terminate */ - found[nr++] = NULL; + init_ment_iref(&ment, &iref); + + if (ment.level != i) + continue; + + ret = alloc_manifest_ref(sb, ref_list, &ment); + if (ret) + goto out; + } + ret = 0; out: - return found; + scoutfs_btree_put_iref(&iref); + scoutfs_btree_put_iref(&prev); + scoutfs_manifest_unlock(sb); + kfree(mkey); + return ret; } /* @@ -549,7 +591,7 @@ int scoutfs_manifest_read_items(struct super_block *sb, trace_scoutfs_read_items(sb, key, end); /* get refs on all the segments */ - ret = scoutfs_net_manifest_range_entries(sb, key, end, &ref_list); + ret = get_manifest_refs(sb, key, end, &ref_list); if (ret) goto out; @@ -705,40 +747,6 @@ out: return ret; } -int scoutfs_manifest_has_dirty(struct super_block *sb) -{ - DECLARE_MANIFEST(sb, mani); - int ret; - - down_write(&mani->rwsem); - ret = scoutfs_ring_has_dirty(&mani->ring); - up_write(&mani->rwsem); - - return ret; -} - -int scoutfs_manifest_submit_write(struct super_block *sb, - struct scoutfs_bio_completion *comp) -{ - DECLARE_MANIFEST(sb, mani); - int ret; - - down_write(&mani->rwsem); - ret = scoutfs_ring_submit_write(sb, &mani->ring, comp); - up_write(&mani->rwsem); - - return ret; -} - -void scoutfs_manifest_write_complete(struct super_block *sb) -{ - DECLARE_MANIFEST(sb, mani); - - down_write(&mani->rwsem); - scoutfs_ring_write_complete(&mani->ring); - up_write(&mani->rwsem); -} - /* * Give the caller the segments that will be involved in the next * compaction. 
@@ -766,13 +774,13 @@ int scoutfs_manifest_next_compact(struct super_block *sb, void *data) DECLARE_MANIFEST(sb, mani); struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->super; - struct scoutfs_manifest_entry *ment; - struct scoutfs_manifest_entry *over; - struct manifest_search_key skey; - struct scoutfs_key_buf ment_first; - struct scoutfs_key_buf ment_last; - struct scoutfs_key_buf over_first; - struct scoutfs_key_buf over_last; + struct scoutfs_manifest_entry ment; + struct scoutfs_manifest_entry over; + struct scoutfs_manifest_btree_key *mkey = NULL; + SCOUTFS_BTREE_ITEM_REF(iref); + SCOUTFS_BTREE_ITEM_REF(over_iref); + SCOUTFS_BTREE_ITEM_REF(prev); + unsigned mkey_len; bool sticky; int level; int ret; @@ -794,54 +802,70 @@ int scoutfs_manifest_next_compact(struct super_block *sb, void *data) goto out; } + /* alloc a full size mkey, fill it with whatever search key */ - /* find the oldest level 0 or the next higher order level by key */ - if (level == 0) { - ment = scoutfs_ring_first(&mani->ring); - if (ment && ment->level) - ment = NULL; - } else { - skey.key = mani->compact_keys[level]; - skey.level = level; - skey.seq = 0; - ment = scoutfs_ring_lookup_next(&mani->ring, &skey); - if (ment == NULL || ment->level != level) { - scoutfs_key_set_min(skey.key); - ment = scoutfs_ring_lookup_next(&mani->ring, &skey); - } - } - if (ment == NULL || ment->level != level) { - /* XXX shouldn't be possible */ - ret = 0; + mkey = alloc_btree_key_val_lens(SCOUTFS_MAX_KEY_SIZE, 0); + if (!mkey) { + ret = -ENOMEM; goto out; } - init_ment_keys(ment, &ment_first, &ment_last); + /* find the oldest level 0 or the next higher order level by key */ + if (level == 0) { + /* find the oldest level 0 */ + mkey_len = init_btree_key(mkey, 0, 0, NULL); + ret = scoutfs_btree_next(sb, &super->manifest.root, + mkey, mkey_len, &iref); + } else { + /* find the next segment after the compaction at this level */ + mkey_len = init_btree_key(mkey, level, 0, + mani->compact_keys[level]); + + ret = scoutfs_btree_next(sb, &super->manifest.root, + mkey, mkey_len, &iref); + if (ret == 0) { + init_ment_iref(&ment, &iref); + if (ment.level != level) + ret = -ENOENT; + } + if (ret == -ENOENT) { + /* .. 
possibly wrapping to the first key in level */ + mkey_len = init_btree_key(mkey, level, 0, NULL); + scoutfs_btree_put_iref(&iref); + ret = scoutfs_btree_next(sb, &super->manifest.root, + mkey, mkey_len, &iref); + } + } + if (ret == 0) { + init_ment_iref(&ment, &iref); + if (ment.level != level) + goto out; + } + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } /* add the upper input segment */ - ret = scoutfs_compact_add(sb, data, &ment_first, &ment_last, - le64_to_cpu(ment->segno), - le64_to_cpu(ment->seq), level); + ret = scoutfs_compact_add(sb, data, &ment); if (ret) goto out; nr++; - /* start with the first overlapping at the next level */ - skey.key = &ment_first; - skey.level = level + 1; - skey.seq = 0; - over = scoutfs_ring_lookup_next(&mani->ring, &skey); - /* and add a fanout's worth of lower overlapping segments */ + mkey_len = init_btree_key(mkey, level + 1, 0, &ment.first); + ret = btree_prev_overlap_or_next(sb, &super->manifest.root, + mkey, mkey_len, + &ment.first, level + 1, &over_iref); sticky = false; - for (i = 0; i < SCOUTFS_MANIFEST_FANOUT + 1; i++) { - if (!over || over->level != (ment->level + 1)) + for (i = 0; ret == 0 && i < SCOUTFS_MANIFEST_FANOUT + 1; i++) { + init_ment_iref(&over, &over_iref); + if (over.level != level + 1) break; - init_ment_keys(over, &over_first, &over_last); - - if (scoutfs_key_compare_ranges(&ment_first, &ment_last, - &over_first, &over_last) != 0) + if (scoutfs_key_compare_ranges(&ment.first, &ment.last, + &over.first, &over.last) != 0) break; /* upper level has to stay around when more than fanout */ @@ -850,114 +874,42 @@ int scoutfs_manifest_next_compact(struct super_block *sb, void *data) break; } - ret = scoutfs_compact_add(sb, data, &over_first, &over_last, - le64_to_cpu(over->segno), - le64_to_cpu(over->seq), level + 1); + ret = scoutfs_compact_add(sb, data, &over); if (ret) goto out; nr++; - over = scoutfs_ring_next(&mani->ring, over); + swap(prev, over_iref); + ret = scoutfs_btree_after(sb, &super->manifest.root, + prev.key, prev.key_len, &over_iref); + scoutfs_btree_put_iref(&prev); } + if (ret < 0 && ret != -ENOENT) + goto out; scoutfs_compact_describe(sb, data, level, mani->nr_levels - 1, sticky); /* record the next key to start from */ - scoutfs_key_copy(mani->compact_keys[level], &ment_last); + scoutfs_key_copy(mani->compact_keys[level], &ment.last); scoutfs_key_inc(mani->compact_keys[level]); ret = 0; out: up_write(&mani->rwsem); + + kfree(mkey); + scoutfs_btree_put_iref(&iref); + scoutfs_btree_put_iref(&over_iref); + scoutfs_btree_put_iref(&prev); + return ret ?: nr; } -/* - * Manifest entries for all levels are stored in a single ring. - * - * First they're sorted by their level. - * - * Level 0 segments can contain any items which overlap so they are - * sorted by their sequence number. Compaction can find the first node - * and reading walks backwards through level 0 to get them from newest - * to oldest to resolve matching items. - * - * Higher level segments don't overlap. They are sorted by their first - * key. - * - * Searching comparisons are different than insertion and deletion - * comparisons for higher level segments. Searches want to find the - * segment that intersects with a given key. Insertions and deletions - * want to operate on the segment with a specific first key and sequence - * number. We tell the difference by the presence of a sequence number. - * A segment will never have a seq of 0. 
- */
-static int manifest_ring_compare_key(void *key, void *data)
-{
-	struct manifest_search_key *skey = key;
-	struct scoutfs_manifest_entry *ment = data;
-	struct scoutfs_key_buf first;
-	struct scoutfs_key_buf last;
-	int cmp;
-
-	scoutfs_key_init(&first, NULL, 0);
-
-	if (skey->level < ment->level) {
-		cmp = -1;
-		goto out;
-	}
-	if (skey->level > ment->level) {
-		cmp = 1;
-		goto out;
-	}
-
-	if (skey->level == 0) {
-		cmp = scoutfs_cmp_u64s(skey->seq, le64_to_cpu(ment->seq));
-		goto out;
-	}
-
-	init_ment_keys(ment, &first, &last);
-
-	if (skey->seq == 0) {
-		cmp = scoutfs_key_compare_ranges(skey->key, skey->key,
-						 &first, &last);
-	} else {
-		cmp = scoutfs_key_compare(skey->key, &first) ?:
-		      scoutfs_cmp_u64s(skey->seq, le64_to_cpu(ment->seq));
-	}
-
-out:
-#if 0
-	/* pretty expensive to be on by default */
-	SK_TRACE_PRINTK("%u,%llu,"SK_FMT" %c %u,%llu,"SK_FMT"\n",
-			skey->level, skey->seq, SK_ARG(skey->key),
-			cmp < 0 ? '<' : cmp == 0 ? '=' : '>',
-			ment->level, le64_to_cpu(ment->seq), SK_ARG(&first));
-#endif
-	return cmp;
-}
-
-static int manifest_ring_compare_data(void *a, void *b)
-{
-	struct manifest_search_key skey;
-	struct scoutfs_manifest_entry *ment = a;
-	struct scoutfs_key_buf key;
-
-	init_ment_keys(ment, &key, NULL);
-
-	skey.seq = le64_to_cpu(ment->seq);
-	skey.key = &key;
-	skey.level = ment->level;
-
-	return manifest_ring_compare_key(&skey, b);
-}
-
 int scoutfs_manifest_setup(struct super_block *sb)
 {
 	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
 	struct scoutfs_super_block *super = &sbi->super;
 	struct manifest *mani;
-	int ret;
 	int i;
 
 	mani = kzalloc(sizeof(struct manifest), GFP_KERNEL);
@@ -965,14 +917,6 @@ int scoutfs_manifest_setup(struct super_block *sb)
 		return -ENOMEM;
 
 	init_rwsem(&mani->rwsem);
-	scoutfs_ring_init(&mani->ring, &super->manifest.ring,
-			  manifest_ring_compare_key,
-			  manifest_ring_compare_data);
-	ret = scoutfs_ring_load(sb, &mani->ring);
-	if (ret) {
-		kfree(mani);
-		return ret;
-	}
 
 	for (i = 0; i < ARRAY_SIZE(mani->compact_keys); i++) {
 		mani->compact_keys[i] = scoutfs_key_alloc(sb,
@@ -980,7 +924,6 @@ int scoutfs_manifest_setup(struct super_block *sb)
 		if (!mani->compact_keys[i]) {
 			while (--i >= 0)
 				scoutfs_key_free(sb, mani->compact_keys[i]);
-			scoutfs_ring_destroy(&mani->ring);
 			kfree(mani);
 			return -ENOMEM;
 		}
@@ -1015,7 +958,6 @@ void scoutfs_manifest_destroy(struct super_block *sb)
 	int i;
 
 	if (mani) {
-		scoutfs_ring_destroy(&mani->ring);
 		for (i = 0; i < ARRAY_SIZE(mani->compact_keys); i++)
 			scoutfs_key_free(sb, mani->compact_keys[i]);
 		kfree(mani);
diff --git a/kmod/src/manifest.h b/kmod/src/manifest.h
index 46aef7d1..39e0e134 100644
--- a/kmod/src/manifest.h
+++ b/kmod/src/manifest.h
@@ -1,47 +1,39 @@
 #ifndef _SCOUTFS_MANIFEST_H_
 #define _SCOUTFS_MANIFEST_H_
 
-struct scoutfs_key_buf;
+#include "key.h"
+
 struct scoutfs_bio_completion;
 
+/*
+ * This native manifest entry references the physical storage of a
+ * manifest entry, which can exist in a segment header and its edge
+ * keys, in a network transmission of a packed entry and its keys, or
+ * in btree blocks spread between an item key and value.
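+ *
+ * The first and last key_bufs don't own their bytes; they reference
+ * whichever of those storage formats backs the entry, so the entry is
+ * only usable while that storage is.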
+ */ +struct scoutfs_manifest_entry { + u8 level; + u64 segno; + u64 seq; + struct scoutfs_key_buf first; + struct scoutfs_key_buf last; +}; + +void scoutfs_manifest_init_entry(struct scoutfs_manifest_entry *ment, + u64 level, u64 segno, u64 seq, + struct scoutfs_key_buf *first, + struct scoutfs_key_buf *last); int scoutfs_manifest_add(struct super_block *sb, - struct scoutfs_key_buf *first, - struct scoutfs_key_buf *last, u64 segno, u64 seq, - u8 level); -int scoutfs_manifest_add_ment(struct super_block *sb, - struct scoutfs_manifest_entry *add); -int scoutfs_manifest_dirty(struct super_block *sb, - struct scoutfs_key_buf *first, u64 seq, u8 level); -int scoutfs_manifest_del(struct super_block *sb, struct scoutfs_key_buf *first, - u64 seq, u8 level); -int scoutfs_manifest_has_dirty(struct super_block *sb); -int scoutfs_manifest_submit_write(struct super_block *sb, - struct scoutfs_bio_completion *comp); -void scoutfs_manifest_write_complete(struct super_block *sb); - -int scoutfs_manifest_bytes(struct scoutfs_manifest_entry *ment); - -struct scoutfs_manifest_entry * -scoutfs_manifest_alloc_entry(struct super_block *sb, - struct scoutfs_key_buf *first, - struct scoutfs_key_buf *last, u64 segno, u64 seq, - u8 level); + struct scoutfs_manifest_entry *ment); +int scoutfs_manifest_del(struct super_block *sb, + struct scoutfs_manifest_entry *ment); int scoutfs_manifest_lock(struct super_block *sb); int scoutfs_manifest_unlock(struct super_block *sb); -struct scoutfs_manifest_entry ** -scoutfs_manifest_find_range_entries(struct super_block *sb, - struct scoutfs_key_buf *key, - struct scoutfs_key_buf *end, - unsigned *found_bytes); - int scoutfs_manifest_read_items(struct super_block *sb, struct scoutfs_key_buf *key, struct scoutfs_key_buf *end); -int scoutfs_manifest_add_ment_ref(struct super_block *sb, - struct list_head *list, - struct scoutfs_manifest_entry *ment); int scoutfs_manifest_next_compact(struct super_block *sb, void *data); diff --git a/kmod/src/net.c b/kmod/src/net.c index d971bb36..09a4d8b1 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -25,6 +25,7 @@ #include "net.h" #include "counters.h" #include "inode.h" +#include "btree.h" #include "manifest.h" #include "bio.h" #include "alloc.h" @@ -331,8 +332,8 @@ static void scoutfs_net_ring_commit_func(struct work_struct *work) down_write(&nti->ring_commit_rwsem); - if (scoutfs_manifest_has_dirty(sb) || scoutfs_alloc_has_dirty(sb)) { - ret = scoutfs_manifest_submit_write(sb, &comp) ?: + if (scoutfs_btree_has_dirty(sb)) { + ret = scoutfs_btree_write_dirty(sb) ?: scoutfs_alloc_submit_write(sb, &comp) ?: scoutfs_bio_wait_comp(sb, &comp) ?: scoutfs_write_dirty_super(sb); @@ -340,7 +341,7 @@ static void scoutfs_net_ring_commit_func(struct work_struct *work) /* we'd need to loop or something */ BUG_ON(ret); - scoutfs_manifest_write_complete(sb); + scoutfs_btree_write_complete(sb); scoutfs_alloc_write_complete(sb); scoutfs_advance_dirty_super(sb); @@ -425,6 +426,69 @@ static struct send_buf *process_bulk_alloc(struct super_block *sb,void *req, return sbuf; } +static void init_net_ment_keys(struct scoutfs_net_manifest_entry *net_ment, + struct scoutfs_key_buf *first, + struct scoutfs_key_buf *last) +{ + scoutfs_key_init(first, net_ment->keys, + le16_to_cpu(net_ment->first_key_len)); + scoutfs_key_init(last, net_ment->keys + + le16_to_cpu(net_ment->first_key_len), + le16_to_cpu(net_ment->last_key_len)); +} + +/* + * Allocate a contiguous manifest entry for communication over the network. 
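+ *
+ * The first and last key bytes are packed immediately after the fixed
+ * size header fields with their lengths recorded in the header, so the
+ * whole entry can be sent as a single buffer.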
+ */ +static struct scoutfs_net_manifest_entry * +alloc_net_ment(struct scoutfs_manifest_entry *ment) +{ + struct scoutfs_net_manifest_entry *net_ment; + struct scoutfs_key_buf first; + struct scoutfs_key_buf last; + + net_ment = kmalloc(offsetof(struct scoutfs_net_manifest_entry, + keys[ment->first.key_len + + ment->last.key_len]), GFP_NOFS); + if (!net_ment) + return NULL; + + net_ment->segno = cpu_to_le64(ment->segno); + net_ment->seq = cpu_to_le64(ment->seq); + net_ment->first_key_len = cpu_to_le16(ment->first.key_len); + net_ment->last_key_len = cpu_to_le16(ment->last.key_len); + net_ment->level = ment->level; + + init_net_ment_keys(net_ment, &first, &last); + scoutfs_key_copy(&first, &ment->first); + scoutfs_key_copy(&last, &ment->last); + + return net_ment; +} + +/* point a native manifest entry at a contiguous net manifest */ +static void init_ment_net_ment(struct scoutfs_manifest_entry *ment, + struct scoutfs_net_manifest_entry *net_ment) +{ + struct scoutfs_key_buf first; + struct scoutfs_key_buf last; + + init_net_ment_keys(net_ment, &first, &last); + scoutfs_key_clone(&ment->first, &first); + scoutfs_key_clone(&ment->last, &last); + + ment->segno = le64_to_cpu(net_ment->segno); + ment->seq = le64_to_cpu(net_ment->seq); + ment->level = net_ment->level; +} + +static unsigned net_ment_bytes(struct scoutfs_net_manifest_entry *net_ment) +{ + return offsetof(struct scoutfs_net_manifest_entry, + keys[le16_to_cpu(net_ment->first_key_len) + + le16_to_cpu(net_ment->last_key_len)]); +} + /* * This is new segments arriving. It needs to wait for level 0 to be * free. It has relatively little visibility into the manifest, though. @@ -443,19 +507,20 @@ static struct send_buf *process_record_segment(struct super_block *sb, void *req, int req_len) { DECLARE_NET_INFO(sb, nti); - struct scoutfs_manifest_entry *ment; + struct scoutfs_manifest_entry ment; + struct scoutfs_net_manifest_entry *net_ment; struct commit_waiter cw; struct send_buf *sbuf; int ret; - if (req_len < sizeof(struct scoutfs_manifest_entry)) { + if (req_len < sizeof(struct scoutfs_net_manifest_entry)) { sbuf = ERR_PTR(-EINVAL); goto out; } - ment = req; + net_ment = req; - if (req_len != scoutfs_manifest_bytes(ment)) { + if (req_len != net_ment_bytes(net_ment)) { sbuf = ERR_PTR(-EINVAL); goto out; } @@ -472,7 +537,9 @@ retry: goto retry; } - ret = scoutfs_manifest_add_ment(sb, ment); + init_ment_net_ment(&ment, net_ment); + + ret = scoutfs_manifest_add(sb, &ment); scoutfs_manifest_unlock(sb); if (ret == 0) @@ -542,73 +609,6 @@ out: return sbuf; } -/* - * Find the manifest entries that intersect with the request's key - * range. We lock the manifest and get pointers to the manifest entries - * that intersect. We then allocate a reply buffer and copy them over. 
- */ -static struct send_buf *process_manifest_range_entries(struct super_block *sb, - void *req, int req_len) -{ - struct scoutfs_net_key_range *kr = req; - struct scoutfs_net_manifest_entries *ments; - struct scoutfs_manifest_entry **found = NULL; - struct scoutfs_manifest_entry *ment; - struct scoutfs_key_buf start; - struct scoutfs_key_buf end; - struct send_buf *sbuf; - unsigned total; - unsigned bytes; - int i; - - /* XXX this is a write lock and should be a read lock */ - scoutfs_manifest_lock(sb); - - if (req_len < sizeof(struct scoutfs_net_key_range) || - req_len < offsetof(struct scoutfs_net_key_range, - key_bytes[le16_to_cpu(kr->start_len) + - le16_to_cpu(kr->end_len)])) { - sbuf = ERR_PTR(-EINVAL); - goto out; - } - - scoutfs_key_init(&start, kr->key_bytes, le16_to_cpu(kr->start_len)); - scoutfs_key_init(&end, kr->key_bytes + le16_to_cpu(kr->start_len), - le16_to_cpu(kr->end_len)); - - found = scoutfs_manifest_find_range_entries(sb, &start, &end, &total); - if (IS_ERR(found)) { - sbuf = ERR_CAST(found); - goto out; - } - - total += sizeof(struct scoutfs_net_manifest_entries); - - sbuf = alloc_sbuf(total); - if (!sbuf) { - sbuf = ERR_PTR(-ENOMEM); - goto out; - } - - ments = (void *)sbuf->nh->data; - ment = ments->ments; - - for (i = 0; found[i]; i++) { - bytes = scoutfs_manifest_bytes(found[i]); - memcpy(ment, found[i], bytes); - ment = (void *)((char *)ment + bytes); - } - - ments->nr = cpu_to_le16(i); - sbuf->nh->status = SCOUTFS_NET_STATUS_SUCCESS; - -out: - scoutfs_manifest_unlock(sb); - if (!IS_ERR_OR_NULL(found)) - kfree(found); - return sbuf; -} - /* * XXX should this call into inodes? not sure about the layering here. */ @@ -790,8 +790,6 @@ static proc_func_t type_proc_func(u8 type) { static proc_func_t funcs[] = { [SCOUTFS_NET_ALLOC_INODES] = process_alloc_inodes, - [SCOUTFS_NET_MANIFEST_RANGE_ENTRIES] = - process_manifest_range_entries, [SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno, [SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment, [SCOUTFS_NET_BULK_ALLOC] = process_bulk_alloc, @@ -889,7 +887,8 @@ static void destroy_server_state(struct super_block *sb) scoutfs_compact_destroy(sb); scoutfs_alloc_destroy(sb); - scoutfs_manifest_destroy(sb); + /* XXX this drops dirty data on the floor.. has it committed? 
*/ + scoutfs_btree_write_complete(sb); /* XXX these should be persistent and reclaimed during recovery */ list_for_each_entry_safe(ps, tmp, &nti->pending_seqs, head) { @@ -918,6 +917,7 @@ static void scoutfs_net_proc_func(struct work_struct *work) mutex_lock(&nti->mutex); if (!nti->server_loaded) { ret = scoutfs_read_supers(sb, &SCOUTFS_SB(sb)->super) ?: + scoutfs_btree_prepare_write(sb) ?: scoutfs_manifest_setup(sb) ?: scoutfs_alloc_setup(sb) ?: scoutfs_compact_setup(sb); @@ -1526,22 +1526,24 @@ static int record_segment_reply(struct super_block *sb, void *reply, int ret, int scoutfs_net_record_segment(struct super_block *sb, struct scoutfs_segment *seg, u8 level) { - struct scoutfs_manifest_entry *ment; + struct scoutfs_net_manifest_entry *net_ment; struct record_segment_args args; + struct scoutfs_manifest_entry ment; int ret; - ment = scoutfs_seg_manifest_entry(sb, seg, level); - if (!ment) { + scoutfs_seg_init_ment(&ment, level, seg); + net_ment = alloc_net_ment(&ment); + if (!net_ment) { ret = -ENOMEM; goto out; } init_completion(&args.comp); - ret = add_send_buf(sb, SCOUTFS_NET_RECORD_SEGMENT, ment, - scoutfs_manifest_bytes(ment), + ret = add_send_buf(sb, SCOUTFS_NET_RECORD_SEGMENT, net_ment, + net_ment_bytes(net_ment), record_segment_reply, &args); - kfree(ment); + kfree(net_ment); if (ret == 0) { wait_for_completion(&args.comp); ret = args.ret; @@ -1592,119 +1594,6 @@ int scoutfs_net_alloc_segno(struct super_block *sb, u64 *segno) return ret; } -struct manifest_range_entries_args { - struct list_head *list; - struct completion comp; - int ret; -}; - -/* - * The server has given us entries that intersect with our request's - * key range. Our caller is still blocked waiting for our completion. - * We walk the manifest entries and add native manifest refs to their - * list and wake them. - */ -static int manifest_range_entries_reply(struct super_block *sb, void *reply, - int reply_bytes, void *arg) -{ - struct manifest_range_entries_args *args = arg; - struct scoutfs_net_manifest_entries *ments = reply; - struct scoutfs_manifest_entry *ment; - unsigned bytes; - int ret = 0; - int i; - - if (reply_bytes < 0) { - ret = reply_bytes; - goto out; - } - - reply_bytes -= sizeof(struct scoutfs_net_manifest_entries); - if (reply_bytes < 0) { - ret = -EINVAL; - goto out; - } - - ment = ments->ments; - for (i = 0; i < le16_to_cpu(ments->nr); i++) { - - - if (reply_bytes < sizeof(struct scoutfs_manifest_entry)) { - ret = -EINVAL; - goto out; - } - - bytes = scoutfs_manifest_bytes(ment); - reply_bytes -= bytes; - if (reply_bytes < 0) { - ret = -EINVAL; - goto out; - } - - ret = scoutfs_manifest_add_ment_ref(sb, args->list, ment); - if (ret) - break; - - ment = (void *)((char *)ment + bytes); - } - -out: - args->ret = ret; - complete(&args->comp); /* args can be freed from this point */ - return ret; -} - -/* - * Ask the manifest server for the manifest entries whose key range - * intersects with the callers key range. The reply func will fill the - * caller's list with the reply's entries. - * - * XXX for now this can't be interrupted. The reply func which is off - * in work in a worker thread is blocking to allocate and put things on - * a list in our stack. We'd need better lifetime support to let it - * find out that we've returned and that it should stop processing the - * reply. 
- */ -int scoutfs_net_manifest_range_entries(struct super_block *sb, - struct scoutfs_key_buf *start, - struct scoutfs_key_buf *end, - struct list_head *list) -{ - struct manifest_range_entries_args args; - struct scoutfs_net_key_range *kr; - struct scoutfs_key_buf start_key; - struct scoutfs_key_buf end_key; - unsigned len; - int ret; - - len = sizeof(struct scoutfs_net_key_range) + - start->key_len + end->key_len; - kr = kmalloc(len, GFP_NOFS); - if (!kr) - return -ENOMEM; - - kr->start_len = cpu_to_le16(start->key_len); - kr->end_len = cpu_to_le16(end->key_len); - - scoutfs_key_init(&start_key, kr->key_bytes, start->key_len); - scoutfs_key_init(&end_key, kr->key_bytes + start->key_len, - end->key_len); - scoutfs_key_copy(&start_key, start); - scoutfs_key_copy(&end_key, end); - - args.list = list; - init_completion(&args.comp); - - ret = add_send_buf(sb, SCOUTFS_NET_MANIFEST_RANGE_ENTRIES, kr, len, - manifest_range_entries_reply, &args); - kfree(kr); - if (ret) - return ret; - - wait_for_completion(&args.comp); - return args.ret; -} - static int alloc_inodes_reply(struct super_block *sb, void *reply, int ret, void *arg) { diff --git a/kmod/src/net.h b/kmod/src/net.h index ea131144..bcfa34f9 100644 --- a/kmod/src/net.h +++ b/kmod/src/net.h @@ -5,10 +5,6 @@ struct scoutfs_key_buf; struct scoutfs_segment; int scoutfs_net_alloc_inodes(struct super_block *sb); -int scoutfs_net_manifest_range_entries(struct super_block *sb, - struct scoutfs_key_buf *start, - struct scoutfs_key_buf *end, - struct list_head *list); int scoutfs_net_alloc_segno(struct super_block *sb, u64 *segno); int scoutfs_net_record_segment(struct super_block *sb, struct scoutfs_segment *seg, u8 level); diff --git a/kmod/src/ring.c b/kmod/src/ring.c index 657bfe82..26e256f6 100644 --- a/kmod/src/ring.c +++ b/kmod/src/ring.c @@ -350,8 +350,7 @@ void *scoutfs_ring_prev(struct scoutfs_ring_info *ring, void *data) /* * Calculate the most blocks we could have to use to store a given number - * of bytes of entries. At worst each block has a header and leaves one - * less than the max manifest entry unused. + * of bytes of entries. */ static unsigned most_blocks(unsigned long bytes) { @@ -359,8 +358,7 @@ static unsigned most_blocks(unsigned long bytes) space = SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_ring_block) - - (sizeof(struct scoutfs_manifest_entry) + - (2 * SCOUTFS_MAX_KEY_SIZE) - 1); + sizeof(struct scoutfs_alloc_region); return DIV_ROUND_UP(bytes, space); } diff --git a/kmod/src/seg.c b/kmod/src/seg.c index fffbbf34..1cda3462 100644 --- a/kmod/src/seg.c +++ b/kmod/src/seg.c @@ -673,11 +673,8 @@ bool scoutfs_seg_append_item(struct super_block *sb, struct scoutfs_segment *seg return true; } -/* - * Add a dirty manifest entry for the given segment at the given level. 
- */ -int scoutfs_seg_manifest_add(struct super_block *sb, - struct scoutfs_segment *seg, u8 level) +void scoutfs_seg_init_ment(struct scoutfs_manifest_entry *ment, int level, + struct scoutfs_segment *seg) { struct scoutfs_segment_block *sblk = off_ptr(seg, 0); struct scoutfs_key_buf first; @@ -685,38 +682,8 @@ int scoutfs_seg_manifest_add(struct super_block *sb, first_last_keys(seg, &first, &last); - return scoutfs_manifest_add(sb, &first, &last, le64_to_cpu(sblk->segno), - le64_to_cpu(sblk->seq), level); -} - -int scoutfs_seg_manifest_del(struct super_block *sb, - struct scoutfs_segment *seg, u8 level) -{ - struct scoutfs_segment_block *sblk = off_ptr(seg, 0); - struct scoutfs_key_buf first; - - first_last_keys(seg, &first, NULL); - - return scoutfs_manifest_del(sb, &first, le64_to_cpu(sblk->seq), level); -} - -/* - * Return an allocated manifest entry that describes the segment, returns - * NULL if it couldn't allocate. - */ -struct scoutfs_manifest_entry * -scoutfs_seg_manifest_entry(struct super_block *sb, - struct scoutfs_segment *seg, u8 level) -{ - struct scoutfs_segment_block *sblk = off_ptr(seg, 0); - struct scoutfs_key_buf first; - struct scoutfs_key_buf last; - - first_last_keys(seg, &first, &last); - - return scoutfs_manifest_alloc_entry(sb, &first, &last, - le64_to_cpu(sblk->segno), - le64_to_cpu(sblk->seq), level); + scoutfs_manifest_init_entry(ment, level, le64_to_cpu(sblk->segno), + le64_to_cpu(sblk->seq), &first, &last); } /* diff --git a/kmod/src/seg.h b/kmod/src/seg.h index 9d1cd4c9..5a2909d4 100644 --- a/kmod/src/seg.h +++ b/kmod/src/seg.h @@ -3,6 +3,7 @@ struct scoutfs_bio_completion; struct scoutfs_key_buf; +struct scoutfs_manifest_entry; struct kvec; /* this is only visible for trace events */ @@ -39,19 +40,13 @@ bool scoutfs_seg_fits_single(u32 nr_items, u32 key_bytes, u32 val_bytes); bool scoutfs_seg_append_item(struct super_block *sb, struct scoutfs_segment *seg, struct scoutfs_key_buf *key, struct kvec *val, u8 flags, __le32 **links); -int scoutfs_seg_manifest_add(struct super_block *sb, - struct scoutfs_segment *seg, u8 level); -int scoutfs_seg_manifest_del(struct super_block *sb, - struct scoutfs_segment *seg, u8 level); +void scoutfs_seg_init_ment(struct scoutfs_manifest_entry *ment, int level, + struct scoutfs_segment *seg); int scoutfs_seg_submit_write(struct super_block *sb, struct scoutfs_segment *seg, struct scoutfs_bio_completion *comp); -struct scoutfs_manifest_entry * -scoutfs_seg_manifest_entry(struct super_block *sb, - struct scoutfs_segment *seg, u8 level); - int scoutfs_seg_setup(struct super_block *sb); void scoutfs_seg_destroy(struct super_block *sb);