diff --git a/kmod/src/btree.c b/kmod/src/btree.c index e30f5af0..55a2a0a8 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -1768,7 +1768,7 @@ int scoutfs_btree_write_dirty(struct super_block *sb) struct scoutfs_super_block *super = &sbi->super; struct scoutfs_btree_ring *bring = &super->bring; struct scoutfs_btree_root *roots[] = { - /* XXX super roots go here */ + &super->manifest.root, NULL, }; struct scoutfs_btree_root *root; diff --git a/kmod/src/compact.c b/kmod/src/compact.c index 56b248a8..12f8d902 100644 --- a/kmod/src/compact.c +++ b/kmod/src/compact.c @@ -457,15 +457,13 @@ void scoutfs_compact_describe(struct super_block *sb, void *data, * and is then possibly adding all the lower overlapping segments. */ int scoutfs_compact_add(struct super_block *sb, void *data, - struct scoutfs_key_buf *first, - struct scoutfs_key_buf *last, u64 segno, u64 seq, - u8 level) + struct scoutfs_manifest_entry *ment) { struct compact_cursor *curs = data; struct compact_seg *cseg; int ret; - cseg = alloc_cseg(sb, first, last); + cseg = alloc_cseg(sb, &ment->first, &ment->last); if (!cseg) { ret = -ENOMEM; goto out; @@ -473,9 +471,9 @@ int scoutfs_compact_add(struct super_block *sb, void *data, list_add_tail(&cseg->entry, &curs->csegs); - cseg->segno = segno; - cseg->seq = seq; - cseg->level = level; + cseg->segno = ment->segno; + cseg->seq = ment->seq; + cseg->level = ment->level; if (!curs->upper) curs->upper = cseg; @@ -501,8 +499,8 @@ void scoutfs_compact_add_segno(struct super_block *sb, void *data, u64 segno) /* * Commit the result of a compaction based on the state of the cursor. - * The net caller stops the rings from being written while we're making - * changes. We lock the manifest to atomically make our changes. + * The net caller stops the manifest from being written while we're + * making changes. We lock the manifest to atomically make our changes. * * The error handling is sketchy here because calling the manifest from * here is temporary.
We should be sending a message to the server @@ -510,6 +508,7 @@ void scoutfs_compact_add_segno(struct super_block *sb, void *data, u64 segno) */ int scoutfs_compact_commit(struct super_block *sb, void *c, void *r) { + struct scoutfs_manifest_entry ment; struct compact_cursor *curs = c; struct list_head *results = r; struct compact_seg *cseg; @@ -533,8 +532,9 @@ int scoutfs_compact_commit(struct super_block *sb, void *c, void *r) BUG_ON(ret); } - ret = scoutfs_manifest_del(sb, cseg->first, - cseg->seq, cseg->level); + scoutfs_manifest_init_entry(&ment, cseg->level, 0, cseg->seq, + cseg->first, NULL); + ret = scoutfs_manifest_del(sb, &ment); BUG_ON(ret); } @@ -542,12 +542,12 @@ int scoutfs_compact_commit(struct super_block *sb, void *c, void *r) list_for_each_entry(cseg, results, entry) { /* XXX moved upper segments won't have read the segment :P */ if (cseg->seg) - ret = scoutfs_seg_manifest_add(sb, cseg->seg, - cseg->level); + scoutfs_seg_init_ment(&ment, cseg->level, cseg->seg); else - ret = scoutfs_manifest_add(sb, cseg->first, - cseg->last, cseg->segno, - cseg->seq, cseg->level); + scoutfs_manifest_init_entry(&ment, cseg->level, + cseg->segno, cseg->seq, + cseg->first, cseg->last); + ret = scoutfs_manifest_add(sb, &ment); BUG_ON(ret); } diff --git a/kmod/src/compact.h b/kmod/src/compact.h index f6f4bb60..c163ce56 100644 --- a/kmod/src/compact.h +++ b/kmod/src/compact.h @@ -6,9 +6,7 @@ void scoutfs_compact_kick(struct super_block *sb); void scoutfs_compact_describe(struct super_block *sb, void *data, u8 upper_level, u8 last_level, bool sticky); int scoutfs_compact_add(struct super_block *sb, void *data, - struct scoutfs_key_buf *first, - struct scoutfs_key_buf *last, u64 segno, u64 seq, - u8 level); + struct scoutfs_manifest_entry *ment); void scoutfs_compact_add_segno(struct super_block *sb, void *data, u64 segno); int scoutfs_compact_commit(struct super_block *sb, void *c, void *r); diff --git a/kmod/src/format.h b/kmod/src/format.h index 8a7a2df5..7ca2f2e3 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -169,16 +169,38 @@ struct scoutfs_btree_ring { #define SCOUTFS_MANIFEST_FANOUT 10 struct scoutfs_manifest { - struct scoutfs_ring_descriptor ring; + struct scoutfs_btree_root root; __le64 level_counts[SCOUTFS_MANIFEST_MAX_LEVEL]; } __packed; -struct scoutfs_manifest_entry { +/* + * Manifest entries are packed into btree keys and values in a very + * fiddly way so that we can sort them with memcmp first by level then + * by their position in the level. First comes the level. + * + * Level 0 segments are sorted by their seq so they don't have the first + * segment key in the manifest btree key. Both of their keys are in the + * value. + * + * Level 1+ segments are sorted by their first key so their last key is + * in the value. + * + * We go to all this trouble so that we can communicate a version of the + * manifest with one btree root, have dense btree keys which are used as + * separators in parent blocks, and don't duplicate the large keys in + * the manifest btree key and value.
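+ * + * For example, the packed layouts look like this (an illustrative + * sketch, not additional format structs): + * + * level 0: btree key { level, be64 seq }, value { header, first, last } + * level N: btree key { level, first key bytes }, value { header, last }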
+ */ + +struct scoutfs_manifest_btree_key { + __u8 level; + __u8 bkey[0]; +} __packed; + +struct scoutfs_manifest_btree_val { __le64 segno; __le64 seq; __le16 first_key_len; __le16 last_key_len; - __u8 level; __u8 keys[0]; } __packed; @@ -536,9 +558,13 @@ struct scoutfs_net_key_range { __u8 key_bytes[0]; } __packed; -struct scoutfs_net_manifest_entries { - __le16 nr; - struct scoutfs_manifest_entry ments[0]; +struct scoutfs_net_manifest_entry { + __le64 segno; + __le64 seq; + __le16 first_key_len; + __le16 last_key_len; + __u8 level; + __u8 keys[0]; } __packed; /* XXX I dunno, totally made up */ @@ -561,7 +587,6 @@ struct scoutfs_net_segnos { enum { SCOUTFS_NET_ALLOC_INODES = 0, - SCOUTFS_NET_MANIFEST_RANGE_ENTRIES, SCOUTFS_NET_ALLOC_SEGNO, SCOUTFS_NET_RECORD_SEGMENT, SCOUTFS_NET_BULK_ALLOC, diff --git a/kmod/src/manifest.c b/kmod/src/manifest.c index f757c1b9..31c55283 100644 --- a/kmod/src/manifest.c +++ b/kmod/src/manifest.c @@ -20,7 +20,7 @@ #include "kvec.h" #include "seg.h" #include "item.h" -#include "ring.h" +#include "btree.h" #include "cmp.h" #include "compact.h" #include "manifest.h" @@ -30,16 +30,17 @@ #include "scoutfs_trace.h" /* - * Manifest entries are stored in ring nodes. + * Manifest entries are stored in the cow btrees in the persistently + * allocated ring of blocks in the shared device. This lets clients + * read consistent old versions of the manifest when it's safe to do so. * - * They're sorted first by level then by their first key. This enables - * the primary searches based on key value for looking up items in - * segments via the manifest. + * Manifest entries are sorted first by level then by their first key. + * This enables the primary searches based on key value for looking up + * items in segments via the manifest. */ struct manifest { struct rw_semaphore rwsem; - struct scoutfs_ring_info ring; u8 nr_levels; /* calculated on mount, const thereafter */ @@ -78,41 +79,6 @@ struct manifest_ref { struct scoutfs_key_buf *last; }; -/* - * Seq is only specified for operations that differentiate between - * segments with identical items by their sequence number. - */ -struct manifest_search_key { - u64 seq; - struct scoutfs_key_buf *key; - u8 level; -}; - -static void init_ment_keys(struct scoutfs_manifest_entry *ment, - struct scoutfs_key_buf *first, - struct scoutfs_key_buf *last) -{ - if (first) - scoutfs_key_init(first, ment->keys, - le16_to_cpu(ment->first_key_len)); - if (last) - scoutfs_key_init(last, ment->keys + - le16_to_cpu(ment->first_key_len), - le16_to_cpu(ment->last_key_len)); -} - -static bool cmp_range_ment(struct scoutfs_key_buf *key, - struct scoutfs_key_buf *end, - struct scoutfs_manifest_entry *ment) -{ - struct scoutfs_key_buf first; - struct scoutfs_key_buf last; - - init_ment_keys(ment, &first, &last); - - return scoutfs_key_compare_ranges(key, end, &first, &last); -} - /* * Change the level count under the manifest lock. 
We then maintain a * bit that can be tested outside the lock to determine if the caller @@ -153,6 +119,152 @@ bool scoutfs_manifest_level0_full(struct super_block *sb) return test_bit(MANI_FLAG_LEVEL0_FULL, &mani->flags); } +void scoutfs_manifest_init_entry(struct scoutfs_manifest_entry *ment, + u64 level, u64 segno, u64 seq, + struct scoutfs_key_buf *first, + struct scoutfs_key_buf *last) +{ + ment->level = level; + ment->segno = segno; + ment->seq = seq; + + if (first) + scoutfs_key_clone(&ment->first, first); + else + scoutfs_key_init(&ment->first, NULL, 0); + + if (last) + scoutfs_key_clone(&ment->last, last); + else + scoutfs_key_init(&ment->last, NULL, 0); +} + +/* + * level 0 segments have the extra seq up in the btree key. + */ +static struct scoutfs_manifest_btree_key * +alloc_btree_key_val_lens(unsigned first_len, unsigned last_len) +{ + return kmalloc(sizeof(struct scoutfs_manifest_btree_key) + + sizeof(u64) + + sizeof(struct scoutfs_manifest_btree_val) + + first_len + last_len, GFP_NOFS); +} + +/* + * Initialize the btree key and value for a manifest entry in one contiguous + * allocation. + */ +static struct scoutfs_manifest_btree_key * +alloc_btree_key_val(struct scoutfs_manifest_entry *ment, unsigned *mkey_len, + struct scoutfs_manifest_btree_val **mval_ret, + unsigned *mval_len_ret) +{ + struct scoutfs_manifest_btree_key *mkey; + struct scoutfs_manifest_btree_val *mval; + struct scoutfs_key_buf b_first; + struct scoutfs_key_buf b_last; + unsigned bkey_len; + unsigned mval_len; + __be64 seq; + + mkey = alloc_btree_key_val_lens(ment->first.key_len, ment->last.key_len); + if (!mkey) + return NULL; + + if (ment->level == 0) { + seq = cpu_to_be64(ment->seq); + bkey_len = sizeof(seq); + memcpy(mkey->bkey, &seq, bkey_len); + } else { + bkey_len = ment->first.key_len; + } + + *mkey_len = offsetof(struct scoutfs_manifest_btree_key, bkey[bkey_len]); + mval = (void *)mkey + *mkey_len; + + if (ment->level == 0) { + scoutfs_key_init(&b_first, mval->keys, ment->first.key_len); + scoutfs_key_init(&b_last, mval->keys + ment->first.key_len, + ment->last.key_len); + mval_len = sizeof(struct scoutfs_manifest_btree_val) + + ment->first.key_len + ment->last.key_len; + } else { + scoutfs_key_init(&b_first, mkey->bkey, ment->first.key_len); + scoutfs_key_init(&b_last, mval->keys, ment->last.key_len); + mval_len = sizeof(struct scoutfs_manifest_btree_val) + + ment->last.key_len; + } + + mkey->level = ment->level; + mval->segno = cpu_to_le64(ment->segno); + mval->seq = cpu_to_le64(ment->seq); + mval->first_key_len = cpu_to_le16(ment->first.key_len); + mval->last_key_len = cpu_to_le16(ment->last.key_len); + + scoutfs_key_copy(&b_first, &ment->first); + scoutfs_key_copy(&b_last, &ment->last); + + if (mval_ret) { + *mval_ret = mval; + *mval_len_ret = mval_len; + } + return mkey; +} + +/* initialize a native manifest entry to point to the btree key and value */ +static void init_ment_iref(struct scoutfs_manifest_entry *ment, + struct scoutfs_btree_item_ref *iref) +{ + struct scoutfs_manifest_btree_key *mkey = iref->key; + struct scoutfs_manifest_btree_val *mval = iref->val; + + ment->level = mkey->level; + ment->segno = le64_to_cpu(mval->segno); + ment->seq = le64_to_cpu(mval->seq); + + if (ment->level == 0) { + scoutfs_key_init(&ment->first, mval->keys, + le16_to_cpu(mval->first_key_len)); + scoutfs_key_init(&ment->last, mval->keys + + le16_to_cpu(mval->first_key_len), + le16_to_cpu(mval->last_key_len)); + } else { + scoutfs_key_init(&ment->first, mkey->bkey, + le16_to_cpu(mval->first_key_len)); + 
scoutfs_key_init(&ment->last, mval->keys, + le16_to_cpu(mval->last_key_len)); + } +} + +/* + * Fill the callers max-size btree key with the given values and return + * its length. + */ +static unsigned init_btree_key(struct scoutfs_manifest_btree_key *mkey, + u8 level, u64 seq, struct scoutfs_key_buf *first) +{ + struct scoutfs_key_buf b_first; + unsigned bkey_len; + __be64 bseq; + + mkey->level = level; + + if (level == 0) { + bseq = cpu_to_be64(seq); + bkey_len = sizeof(bseq); + memcpy(mkey->bkey, &bseq, bkey_len); + } else if (first) { + scoutfs_key_init(&b_first, mkey->bkey, first->key_len); + scoutfs_key_copy(&b_first, first); + bkey_len = first->key_len; + } else { + bkey_len = 0; + } + + return offsetof(struct scoutfs_manifest_btree_key, bkey[bkey_len]); +} + /* * Insert a new manifest entry in the ring. The ring allocates a new * node for us and we fill it. @@ -160,180 +272,68 @@ bool scoutfs_manifest_level0_full(struct super_block *sb) * This must be called with the manifest lock held. */ int scoutfs_manifest_add(struct super_block *sb, - struct scoutfs_key_buf *first, - struct scoutfs_key_buf *last, u64 segno, u64 seq, - u8 level) + struct scoutfs_manifest_entry *ment) { DECLARE_MANIFEST(sb, mani); - struct scoutfs_manifest_entry *ment; - struct scoutfs_key_buf ment_first; - struct scoutfs_key_buf ment_last; - struct manifest_search_key skey; - unsigned key_bytes; - unsigned bytes; - - trace_scoutfs_manifest_add(sb, level, segno, seq, first, last); - - key_bytes = first->key_len + last->key_len; - bytes = offsetof(struct scoutfs_manifest_entry, keys[key_bytes]); - - skey.key = first; - skey.level = level; - skey.seq = seq; - - ment = scoutfs_ring_insert(&mani->ring, &skey, bytes); - if (!ment) - return -ENOMEM; - - ment->segno = cpu_to_le64(segno); - ment->seq = cpu_to_le64(seq); - ment->first_key_len = cpu_to_le16(first->key_len); - ment->last_key_len = cpu_to_le16(last->key_len); - ment->level = level; - - init_ment_keys(ment, &ment_first, &ment_last); - scoutfs_key_copy(&ment_first, first); - scoutfs_key_copy(&ment_last, last); - - mani->nr_levels = max_t(u8, mani->nr_levels, level + 1); - add_level_count(sb, level, 1); - return 0; -} - -/* - * Add a manifest entry as provided by the caller instead of exploded - * out into arguments. - * - * This must be called with the manifest lock held. 
- */ -int scoutfs_manifest_add_ment(struct super_block *sb, - struct scoutfs_manifest_entry *add) -{ - DECLARE_MANIFEST(sb, mani); - struct scoutfs_manifest_entry *ment; - struct manifest_search_key skey; - struct scoutfs_key_buf first; - struct scoutfs_key_buf last; - unsigned bytes; + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + struct scoutfs_manifest_btree_key *mkey; + struct scoutfs_manifest_btree_val *mval; + unsigned mkey_len; + unsigned mval_len; + int ret; lockdep_assert_held(&mani->rwsem); - init_ment_keys(add, &first, &last); - trace_scoutfs_manifest_add(sb, add->level, le64_to_cpu(add->segno), - le64_to_cpu(add->seq), &first, &last); - - skey.key = &first; - skey.level = add->level; - skey.seq = le64_to_cpu(add->seq); - - bytes = scoutfs_manifest_bytes(add); - - ment = scoutfs_ring_insert(&mani->ring, &skey, bytes); - if (!ment) + mkey = alloc_btree_key_val(ment, &mkey_len, &mval, &mval_len); + if (!mkey) return -ENOMEM; - memcpy(ment, add, bytes); + trace_scoutfs_manifest_add(sb, ment->level, ment->segno, ment->seq, + &ment->first, &ment->last); - mani->nr_levels = max_t(u8, mani->nr_levels, add->level + 1); - add_level_count(sb, add->level, 1); + ret = scoutfs_btree_insert(sb, &super->manifest.root, mkey, mkey_len, + mval, mval_len); + if (ret == 0) { + mani->nr_levels = max_t(u8, mani->nr_levels, ment->level + 1); + add_level_count(sb, ment->level, 1); + } - return 0; + kfree(mkey); + return ret; } /* * This must be called with the manifest lock held. + * + * When this is called from the network we can take the keys directly as + * they were sent from the clients. */ -int scoutfs_manifest_dirty(struct super_block *sb, - struct scoutfs_key_buf *first, u64 seq, u8 level) +int scoutfs_manifest_del(struct super_block *sb, + struct scoutfs_manifest_entry *ment) { DECLARE_MANIFEST(sb, mani); - struct scoutfs_manifest_entry *ment; - struct manifest_search_key skey; + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + struct scoutfs_manifest_btree_key *mkey; + unsigned mkey_len; + int ret; - skey.key = first; - skey.level = level; - skey.seq = seq; + trace_scoutfs_manifest_delete(sb, ment->level, ment->segno, ment->seq, + &ment->first, &ment->last); - ment = scoutfs_ring_lookup(&mani->ring, &skey); - if (!ment) - return -ENOENT; + lockdep_assert_held(&mani->rwsem); - scoutfs_ring_dirty(&mani->ring, ment); - return 0; -} + mkey = alloc_btree_key_val(ment, &mkey_len, NULL, NULL); + if (!mkey) + return -ENOMEM; -/* - * This must be called with the manifest lock held. - */ -int scoutfs_manifest_del(struct super_block *sb, struct scoutfs_key_buf *first, - u64 seq, u8 level) -{ - DECLARE_MANIFEST(sb, mani); - struct scoutfs_manifest_entry *ment; - struct manifest_search_key skey; - struct scoutfs_key_buf last; + ret = scoutfs_btree_delete(sb, &super->manifest.root, mkey, mkey_len); + if (ret == 0) + add_level_count(sb, ment->level, -1ULL); - skey.key = first; - skey.level = level; - skey.seq = seq; - - ment = scoutfs_ring_lookup(&mani->ring, &skey); - if (!ment) - return -ENOENT; - - init_ment_keys(ment, NULL, &last); - trace_scoutfs_manifest_delete(sb, ment->level, le64_to_cpu(ment->segno), - le64_to_cpu(ment->seq), first, &last); - - scoutfs_ring_delete(&mani->ring, ment); - add_level_count(sb, level, -1ULL); - return 0; -} - -/* - * Return the total number of bytes used by the given manifest entry, - * including its struct. 
- */ -int scoutfs_manifest_bytes(struct scoutfs_manifest_entry *ment) -{ - return sizeof(struct scoutfs_manifest_entry) + - le16_to_cpu(ment->first_key_len) + - le16_to_cpu(ment->last_key_len); -} - -/* - * Return an allocated and filled in manifest entry. - */ -struct scoutfs_manifest_entry * -scoutfs_manifest_alloc_entry(struct super_block *sb, - struct scoutfs_key_buf *first, - struct scoutfs_key_buf *last, u64 segno, u64 seq, - u8 level) -{ - struct scoutfs_manifest_entry *ment; - struct scoutfs_key_buf ment_first; - struct scoutfs_key_buf ment_last; - unsigned key_bytes; - unsigned bytes; - - key_bytes = first->key_len + last->key_len; - bytes = offsetof(struct scoutfs_manifest_entry, keys[key_bytes]); - - ment = kmalloc(bytes, GFP_NOFS); - if (!ment) - return NULL; - - ment->segno = cpu_to_le64(segno); - ment->seq = cpu_to_le64(seq); - ment->first_key_len = cpu_to_le16(first->key_len); - ment->last_key_len = cpu_to_le16(last->key_len); - ment->level = level; - - init_ment_keys(ment, &ment_first, &ment_last); - scoutfs_key_copy(&ment_first, first); - scoutfs_key_copy(&ment_last, last); - - return ment; + kfree(mkey); + return ret; } /* @@ -372,50 +372,70 @@ static void free_ref(struct super_block *sb, struct manifest_ref *ref) } /* - * Allocate a native manifest ref so that we can work with segments described - * by the callers manifest entry. - * - * This frees all the elements on the list if it returns an error. + * Allocate a reading manifest ref so that we can work with segments + * described by the caller's manifest entry. */ -int scoutfs_manifest_add_ment_ref(struct super_block *sb, - struct list_head *list, - struct scoutfs_manifest_entry *ment) +static int alloc_manifest_ref(struct super_block *sb, struct list_head *ref_list, + struct scoutfs_manifest_entry *ment) { - struct scoutfs_key_buf ment_first; - struct scoutfs_key_buf ment_last; struct manifest_ref *ref; - struct manifest_ref *tmp; - - init_ment_keys(ment, &ment_first, &ment_last); ref = kzalloc(sizeof(struct manifest_ref), GFP_NOFS); if (ref) { - ref->first = scoutfs_key_dup(sb, &ment_first); - ref->last = scoutfs_key_dup(sb, &ment_last); + ref->first = scoutfs_key_dup(sb, &ment->first); + ref->last = scoutfs_key_dup(sb, &ment->last); } if (!ref || !ref->first || !ref->last) { free_ref(sb, ref); - list_for_each_entry_safe(ref, tmp, list, entry) { - list_del_init(&ref->entry); - free_ref(sb, ref); - } return -ENOMEM; } - ref->segno = le64_to_cpu(ment->segno); - ref->seq = le64_to_cpu(ment->seq); ref->level = ment->level; + ref->segno = ment->segno; + ref->seq = ment->seq; - list_add_tail(&ref->entry, list); + list_add_tail(&ref->entry, ref_list); return 0; } /* - * Return an array of pointers to the entries in the manifest that - * intersect with the given key range. The entries will be ordered by - * the order that they should be read: level 0 from newest to oldest - * then increasing higher order levels. + * Return the previous entry if it's in the right level and it overlaps + * with the start key by having a last key that's >=. If no such entry + * exists it just returns the next entry after the key and doesn't test + * it at all. If this returns 0 then the caller has to put the iref.
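+ * + * For example, if the level holds segments [a,f] and [m,r], a search + * starting at key h finds the previous entry [a,f], sees its last key + * f < h, and falls back to returning the next entry [m,r].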
+ */ +static int btree_prev_overlap_or_next(struct super_block *sb, + struct scoutfs_btree_root *root, + void *key, unsigned key_len, + struct scoutfs_key_buf *start, u8 level, + struct scoutfs_btree_item_ref *iref) +{ + struct scoutfs_manifest_entry ment; + int ret; + + ret = scoutfs_btree_prev(sb, root, key, key_len, iref); + if (ret < 0 && ret != -ENOENT) + return ret; + + if (ret == 0) { + init_ment_iref(&ment, iref); + if (ment.level != level || + scoutfs_key_compare(&ment.last, start) < 0) + ret = -ENOENT; + } + if (ret == -ENOENT) { + scoutfs_btree_put_iref(iref); + ret = scoutfs_btree_next(sb, root, key, key_len, iref); + } + + return ret; +} + +/* + * Get refs on the manifest entries that intersect with the range + * starting with the caller's key. The entries will be ordered by the + * order that they should be read: level 0 from newest to oldest then + * increasing higher order levels. * * We have to get all the level 0 segments that intersect with the range * of items that we want to search because the level 0 segments can @@ -427,74 +447,96 @@ int scoutfs_manifest_add_ment_ref(struct super_block *sb, * existing segment that intersects with the range, even if it doesn't * contain the key. The key might fall between segments at that level. * - * This is called by the server who is processing manifest search - * messages from mounts. The server locks down the manifest while it - * gets these pointers and then uses them to allocate and fill a reply - * message. + * XXX Today this is using the roots from the mount-wide super. This is + * super wrong. Doing so lets it use the dirty btree that could be + * modified by the manifest server running on this node so it has to + * lock. It should be using a specific root communicated by lock lvbs + * (or read from the super on mount). Then the btrees it traverses will + * be stable and read-only. (But can still get -ESTALE if they're + * re-written under us, would need to re-sample roots from the super in + * that case, I imagine.)
*/ -struct scoutfs_manifest_entry ** -scoutfs_manifest_find_range_entries(struct super_block *sb, - struct scoutfs_key_buf *key, - struct scoutfs_key_buf *end, - unsigned *found_bytes) +static int get_manifest_refs(struct super_block *sb, struct scoutfs_key_buf *key, + struct scoutfs_key_buf *end, + struct list_head *ref_list) { DECLARE_MANIFEST(sb, mani); struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super; - struct scoutfs_manifest_entry **found; - struct scoutfs_manifest_entry *ment; - struct manifest_search_key skey; - unsigned nr; + struct scoutfs_manifest_btree_key *mkey; + struct scoutfs_manifest_entry ment; + SCOUTFS_BTREE_ITEM_REF(iref); + SCOUTFS_BTREE_ITEM_REF(prev); + unsigned mkey_len; + int ret; int i; - lockdep_assert_held(&mani->rwsem); + scoutfs_manifest_init_entry(&ment, 0, 0, 0, key, NULL); + mkey = alloc_btree_key_val(&ment, &mkey_len, NULL, NULL); + if (!mkey) + return -ENOMEM; - *found_bytes = 0; - - /* at most we get all level 0, one from other levels, and null term */ - nr = le64_to_cpu(super->manifest.level_counts[0]) + mani->nr_levels + 1; - - found = kcalloc(nr, sizeof(struct scoutfs_manifest_entry *), GFP_NOFS); - if (!found) { - found = ERR_PTR(-ENOMEM); - goto out; - } - - nr = 0; + scoutfs_manifest_lock(sb); /* get level 0 segments that overlap with the missing range */ - skey.key = NULL; - skey.level = 0; - skey.seq = ~0ULL; - ment = scoutfs_ring_lookup_prev(&mani->ring, &skey); - while (ment) { - if (cmp_range_ment(key, end, ment) == 0) { - found[nr++] = ment; - *found_bytes += scoutfs_manifest_bytes(ment); + mkey_len = init_btree_key(mkey, 0, ~0ULL, NULL); + ret = scoutfs_btree_prev(sb, &super->manifest.root, + mkey, mkey_len, &iref); + while (ret == 0) { + init_ment_iref(&ment, &iref); + + if (scoutfs_key_compare_ranges(key, end, &ment.first, + &ment.last) == 0) { + ret = alloc_manifest_ref(sb, ref_list, &ment); + if (ret) + goto out; } - ment = scoutfs_ring_prev(&mani->ring, ment); + swap(prev, iref); + ret = scoutfs_btree_before(sb, &super->manifest.root, + prev.key, prev.key_len, &iref); + scoutfs_btree_put_iref(&prev); } + if (ret != -ENOENT) + goto out; - /* get higher level segments that overlap with the starting key */ + /* + * XXX Today we need to read the next segment if our starting key + * falls between segments. That won't be the case once we tie + * cached items to their locks. 
+ */ + mkey_len = init_btree_key(mkey, 1, 0, key); for (i = 1; i < mani->nr_levels; i++) { - skey.key = key; - skey.level = i; - skey.seq = 0; + mkey->level = i; /* XXX should use level counts to skip searches */ - ment = scoutfs_ring_lookup_next(&mani->ring, &skey); - if (ment) { - found[nr++] = ment; - *found_bytes += scoutfs_manifest_bytes(ment); + scoutfs_btree_put_iref(&iref); + ret = btree_prev_overlap_or_next(sb, &super->manifest.root, + mkey, mkey_len, key, i, + &iref); + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; } - } - /* null terminate */ - found[nr++] = NULL; + init_ment_iref(&ment, &iref); + + if (ment.level != i) + continue; + + ret = alloc_manifest_ref(sb, ref_list, &ment); + if (ret) + goto out; + } + ret = 0; out: - return found; + scoutfs_btree_put_iref(&iref); + scoutfs_btree_put_iref(&prev); + scoutfs_manifest_unlock(sb); + kfree(mkey); + return ret; } /* @@ -549,7 +591,7 @@ int scoutfs_manifest_read_items(struct super_block *sb, trace_scoutfs_read_items(sb, key, end); /* get refs on all the segments */ - ret = scoutfs_net_manifest_range_entries(sb, key, end, &ref_list); + ret = get_manifest_refs(sb, key, end, &ref_list); if (ret) goto out; @@ -705,40 +747,6 @@ out: return ret; } -int scoutfs_manifest_has_dirty(struct super_block *sb) -{ - DECLARE_MANIFEST(sb, mani); - int ret; - - down_write(&mani->rwsem); - ret = scoutfs_ring_has_dirty(&mani->ring); - up_write(&mani->rwsem); - - return ret; -} - -int scoutfs_manifest_submit_write(struct super_block *sb, - struct scoutfs_bio_completion *comp) -{ - DECLARE_MANIFEST(sb, mani); - int ret; - - down_write(&mani->rwsem); - ret = scoutfs_ring_submit_write(sb, &mani->ring, comp); - up_write(&mani->rwsem); - - return ret; -} - -void scoutfs_manifest_write_complete(struct super_block *sb) -{ - DECLARE_MANIFEST(sb, mani); - - down_write(&mani->rwsem); - scoutfs_ring_write_complete(&mani->ring); - up_write(&mani->rwsem); -} - /* * Give the caller the segments that will be involved in the next * compaction. 
@@ -766,13 +774,13 @@ int scoutfs_manifest_next_compact(struct super_block *sb, void *data) DECLARE_MANIFEST(sb, mani); struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->super; - struct scoutfs_manifest_entry *ment; - struct scoutfs_manifest_entry *over; - struct manifest_search_key skey; - struct scoutfs_key_buf ment_first; - struct scoutfs_key_buf ment_last; - struct scoutfs_key_buf over_first; - struct scoutfs_key_buf over_last; + struct scoutfs_manifest_entry ment; + struct scoutfs_manifest_entry over; + struct scoutfs_manifest_btree_key *mkey = NULL; + SCOUTFS_BTREE_ITEM_REF(iref); + SCOUTFS_BTREE_ITEM_REF(over_iref); + SCOUTFS_BTREE_ITEM_REF(prev); + unsigned mkey_len; bool sticky; int level; int ret; @@ -794,54 +802,70 @@ int scoutfs_manifest_next_compact(struct super_block *sb, void *data) goto out; } + /* alloc a full size mkey, fill it with whatever search key */ - /* find the oldest level 0 or the next higher order level by key */ - if (level == 0) { - ment = scoutfs_ring_first(&mani->ring); - if (ment && ment->level) - ment = NULL; - } else { - skey.key = mani->compact_keys[level]; - skey.level = level; - skey.seq = 0; - ment = scoutfs_ring_lookup_next(&mani->ring, &skey); - if (ment == NULL || ment->level != level) { - scoutfs_key_set_min(skey.key); - ment = scoutfs_ring_lookup_next(&mani->ring, &skey); - } - } - if (ment == NULL || ment->level != level) { - /* XXX shouldn't be possible */ - ret = 0; + mkey = alloc_btree_key_val_lens(SCOUTFS_MAX_KEY_SIZE, 0); + if (!mkey) { + ret = -ENOMEM; goto out; } - init_ment_keys(ment, &ment_first, &ment_last); + /* find the oldest level 0 or the next higher order level by key */ + if (level == 0) { + /* find the oldest level 0 */ + mkey_len = init_btree_key(mkey, 0, 0, NULL); + ret = scoutfs_btree_next(sb, &super->manifest.root, + mkey, mkey_len, &iref); + } else { + /* find the next segment after the compaction at this level */ + mkey_len = init_btree_key(mkey, level, 0, + mani->compact_keys[level]); + + ret = scoutfs_btree_next(sb, &super->manifest.root, + mkey, mkey_len, &iref); + if (ret == 0) { + init_ment_iref(&ment, &iref); + if (ment.level != level) + ret = -ENOENT; + } + if (ret == -ENOENT) { + /* .. 
possibly wrapping to the first key in level */ + mkey_len = init_btree_key(mkey, level, 0, NULL); + scoutfs_btree_put_iref(&iref); + ret = scoutfs_btree_next(sb, &super->manifest.root, + mkey, mkey_len, &iref); + } + } + if (ret == 0) { + init_ment_iref(&ment, &iref); + if (ment.level != level) + goto out; + } + if (ret < 0) { + if (ret == -ENOENT) + ret = 0; + goto out; + } /* add the upper input segment */ - ret = scoutfs_compact_add(sb, data, &ment_first, &ment_last, - le64_to_cpu(ment->segno), - le64_to_cpu(ment->seq), level); + ret = scoutfs_compact_add(sb, data, &ment); if (ret) goto out; nr++; - /* start with the first overlapping at the next level */ - skey.key = &ment_first; - skey.level = level + 1; - skey.seq = 0; - over = scoutfs_ring_lookup_next(&mani->ring, &skey); - /* and add a fanout's worth of lower overlapping segments */ + mkey_len = init_btree_key(mkey, level + 1, 0, &ment.first); + ret = btree_prev_overlap_or_next(sb, &super->manifest.root, + mkey, mkey_len, + &ment.first, level + 1, &over_iref); sticky = false; - for (i = 0; i < SCOUTFS_MANIFEST_FANOUT + 1; i++) { - if (!over || over->level != (ment->level + 1)) + for (i = 0; ret == 0 && i < SCOUTFS_MANIFEST_FANOUT + 1; i++) { + init_ment_iref(&over, &over_iref); + if (over.level != level + 1) break; - init_ment_keys(over, &over_first, &over_last); - - if (scoutfs_key_compare_ranges(&ment_first, &ment_last, - &over_first, &over_last) != 0) + if (scoutfs_key_compare_ranges(&ment.first, &ment.last, + &over.first, &over.last) != 0) break; /* upper level has to stay around when more than fanout */ @@ -850,114 +874,42 @@ int scoutfs_manifest_next_compact(struct super_block *sb, void *data) break; } - ret = scoutfs_compact_add(sb, data, &over_first, &over_last, - le64_to_cpu(over->segno), - le64_to_cpu(over->seq), level + 1); + ret = scoutfs_compact_add(sb, data, &over); if (ret) goto out; nr++; - over = scoutfs_ring_next(&mani->ring, over); + swap(prev, over_iref); + ret = scoutfs_btree_after(sb, &super->manifest.root, + prev.key, prev.key_len, &over_iref); + scoutfs_btree_put_iref(&prev); } + if (ret < 0 && ret != -ENOENT) + goto out; scoutfs_compact_describe(sb, data, level, mani->nr_levels - 1, sticky); /* record the next key to start from */ - scoutfs_key_copy(mani->compact_keys[level], &ment_last); + scoutfs_key_copy(mani->compact_keys[level], &ment.last); scoutfs_key_inc(mani->compact_keys[level]); ret = 0; out: up_write(&mani->rwsem); + + kfree(mkey); + scoutfs_btree_put_iref(&iref); + scoutfs_btree_put_iref(&over_iref); + scoutfs_btree_put_iref(&prev); + return ret ?: nr; } -/* - * Manifest entries for all levels are stored in a single ring. - * - * First they're sorted by their level. - * - * Level 0 segments can contain any items which overlap so they are - * sorted by their sequence number. Compaction can find the first node - * and reading walks backwards through level 0 to get them from newest - * to oldest to resolve matching items. - * - * Higher level segments don't overlap. They are sorted by their first - * key. - * - * Searching comparisons are different than insertion and deletion - * comparisons for higher level segments. Searches want to find the - * segment that intersects with a given key. Insertions and deletions - * want to operate on the segment with a specific first key and sequence - * number. We tell the difference by the presence of a sequence number. - * A segment will never have a seq of 0. 
- */ -static int manifest_ring_compare_key(void *key, void *data) -{ - struct manifest_search_key *skey = key; - struct scoutfs_manifest_entry *ment = data; - struct scoutfs_key_buf first; - struct scoutfs_key_buf last; - int cmp; - - scoutfs_key_init(&first, NULL, 0); - - if (skey->level < ment->level) { - cmp = -1; - goto out; - } - if (skey->level > ment->level) { - cmp = 1; - goto out; - } - - if (skey->level == 0) { - cmp = scoutfs_cmp_u64s(skey->seq, le64_to_cpu(ment->seq)); - goto out; - } - - init_ment_keys(ment, &first, &last); - - if (skey->seq == 0) { - cmp = scoutfs_key_compare_ranges(skey->key, skey->key, - &first, &last); - } else { - cmp = scoutfs_key_compare(skey->key, &first) ?: - scoutfs_cmp_u64s(skey->seq, le64_to_cpu(ment->seq)); - } - -out: -#if 0 - /* pretty expensive to be on by default */ - SK_TRACE_PRINTK("%u,%llu,"SK_FMT" %c %u,%llu,"SK_FMT"\n", - skey->level, skey->seq, SK_ARG(skey->key), - cmp < 0 ? '<' : cmp == 0 ? '=' : '>', - ment->level, le64_to_cpu(ment->seq), SK_ARG(&first)); -#endif - return cmp; -} - -static int manifest_ring_compare_data(void *a, void *b) -{ - struct manifest_search_key skey; - struct scoutfs_manifest_entry *ment = a; - struct scoutfs_key_buf key; - - init_ment_keys(ment, &key, NULL); - - skey.seq = le64_to_cpu(ment->seq); - skey.key = &key; - skey.level = ment->level; - - return manifest_ring_compare_key(&skey, b); -} - int scoutfs_manifest_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct scoutfs_super_block *super = &sbi->super; struct manifest *mani; - int ret; int i; mani = kzalloc(sizeof(struct manifest), GFP_KERNEL); @@ -965,14 +917,6 @@ int scoutfs_manifest_setup(struct super_block *sb) return -ENOMEM; init_rwsem(&mani->rwsem); - scoutfs_ring_init(&mani->ring, &super->manifest.ring, - manifest_ring_compare_key, - manifest_ring_compare_data); - ret = scoutfs_ring_load(sb, &mani->ring); - if (ret) { - kfree(mani); - return ret; - } for (i = 0; i < ARRAY_SIZE(mani->compact_keys); i++) { mani->compact_keys[i] = scoutfs_key_alloc(sb, @@ -980,7 +924,6 @@ int scoutfs_manifest_setup(struct super_block *sb) if (!mani->compact_keys[i]) { while (--i >= 0) scoutfs_key_free(sb, mani->compact_keys[i]); - scoutfs_ring_destroy(&mani->ring); kfree(mani); return -ENOMEM; } @@ -1015,7 +958,6 @@ void scoutfs_manifest_destroy(struct super_block *sb) int i; if (mani) { - scoutfs_ring_destroy(&mani->ring); for (i = 0; i < ARRAY_SIZE(mani->compact_keys); i++) scoutfs_key_free(sb, mani->compact_keys[i]); kfree(mani); diff --git a/kmod/src/manifest.h b/kmod/src/manifest.h index 46aef7d1..39e0e134 100644 --- a/kmod/src/manifest.h +++ b/kmod/src/manifest.h @@ -1,47 +1,39 @@ #ifndef _SCOUTFS_MANIFEST_H_ #define _SCOUTFS_MANIFEST_H_ -struct scoutfs_key_buf; +#include "key.h" + struct scoutfs_bio_completion; +/* + * This native manifest entry references the physical storage of a + * manifest entry which can exist in a segment header and its edge keys, + * a network transmission of a packed entry and its keys, or in btree + * blocks spread between an item key and value. 
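+ * + * The first and last key_bufs only reference key bytes owned by one of + * those representations; initializing an entry doesn't copy the keys.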
+ */ +struct scoutfs_manifest_entry { + u8 level; + u64 segno; + u64 seq; + struct scoutfs_key_buf first; + struct scoutfs_key_buf last; +}; + +void scoutfs_manifest_init_entry(struct scoutfs_manifest_entry *ment, + u64 level, u64 segno, u64 seq, + struct scoutfs_key_buf *first, + struct scoutfs_key_buf *last); int scoutfs_manifest_add(struct super_block *sb, - struct scoutfs_key_buf *first, - struct scoutfs_key_buf *last, u64 segno, u64 seq, - u8 level); -int scoutfs_manifest_add_ment(struct super_block *sb, - struct scoutfs_manifest_entry *add); -int scoutfs_manifest_dirty(struct super_block *sb, - struct scoutfs_key_buf *first, u64 seq, u8 level); -int scoutfs_manifest_del(struct super_block *sb, struct scoutfs_key_buf *first, - u64 seq, u8 level); -int scoutfs_manifest_has_dirty(struct super_block *sb); -int scoutfs_manifest_submit_write(struct super_block *sb, - struct scoutfs_bio_completion *comp); -void scoutfs_manifest_write_complete(struct super_block *sb); - -int scoutfs_manifest_bytes(struct scoutfs_manifest_entry *ment); - -struct scoutfs_manifest_entry * -scoutfs_manifest_alloc_entry(struct super_block *sb, - struct scoutfs_key_buf *first, - struct scoutfs_key_buf *last, u64 segno, u64 seq, - u8 level); + struct scoutfs_manifest_entry *ment); +int scoutfs_manifest_del(struct super_block *sb, + struct scoutfs_manifest_entry *ment); int scoutfs_manifest_lock(struct super_block *sb); int scoutfs_manifest_unlock(struct super_block *sb); -struct scoutfs_manifest_entry ** -scoutfs_manifest_find_range_entries(struct super_block *sb, - struct scoutfs_key_buf *key, - struct scoutfs_key_buf *end, - unsigned *found_bytes); - int scoutfs_manifest_read_items(struct super_block *sb, struct scoutfs_key_buf *key, struct scoutfs_key_buf *end); -int scoutfs_manifest_add_ment_ref(struct super_block *sb, - struct list_head *list, - struct scoutfs_manifest_entry *ment); int scoutfs_manifest_next_compact(struct super_block *sb, void *data); diff --git a/kmod/src/net.c b/kmod/src/net.c index d971bb36..09a4d8b1 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -25,6 +25,7 @@ #include "net.h" #include "counters.h" #include "inode.h" +#include "btree.h" #include "manifest.h" #include "bio.h" #include "alloc.h" @@ -331,8 +332,8 @@ static void scoutfs_net_ring_commit_func(struct work_struct *work) down_write(&nti->ring_commit_rwsem); - if (scoutfs_manifest_has_dirty(sb) || scoutfs_alloc_has_dirty(sb)) { - ret = scoutfs_manifest_submit_write(sb, &comp) ?: + if (scoutfs_btree_has_dirty(sb)) { + ret = scoutfs_btree_write_dirty(sb) ?: scoutfs_alloc_submit_write(sb, &comp) ?: scoutfs_bio_wait_comp(sb, &comp) ?: scoutfs_write_dirty_super(sb); @@ -340,7 +341,7 @@ static void scoutfs_net_ring_commit_func(struct work_struct *work) /* we'd need to loop or something */ BUG_ON(ret); - scoutfs_manifest_write_complete(sb); + scoutfs_btree_write_complete(sb); scoutfs_alloc_write_complete(sb); scoutfs_advance_dirty_super(sb); @@ -425,6 +426,69 @@ static struct send_buf *process_bulk_alloc(struct super_block *sb,void *req, return sbuf; } +static void init_net_ment_keys(struct scoutfs_net_manifest_entry *net_ment, + struct scoutfs_key_buf *first, + struct scoutfs_key_buf *last) +{ + scoutfs_key_init(first, net_ment->keys, + le16_to_cpu(net_ment->first_key_len)); + scoutfs_key_init(last, net_ment->keys + + le16_to_cpu(net_ment->first_key_len), + le16_to_cpu(net_ment->last_key_len)); +} + +/* + * Allocate a contiguous manifest entry for communication over the network. 
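+ * The first and last keys are copied end to end after the fixed-size + * header and travel with their lengths in first_key_len and + * last_key_len.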
+ */ +static struct scoutfs_net_manifest_entry * +alloc_net_ment(struct scoutfs_manifest_entry *ment) +{ + struct scoutfs_net_manifest_entry *net_ment; + struct scoutfs_key_buf first; + struct scoutfs_key_buf last; + + net_ment = kmalloc(offsetof(struct scoutfs_net_manifest_entry, + keys[ment->first.key_len + + ment->last.key_len]), GFP_NOFS); + if (!net_ment) + return NULL; + + net_ment->segno = cpu_to_le64(ment->segno); + net_ment->seq = cpu_to_le64(ment->seq); + net_ment->first_key_len = cpu_to_le16(ment->first.key_len); + net_ment->last_key_len = cpu_to_le16(ment->last.key_len); + net_ment->level = ment->level; + + init_net_ment_keys(net_ment, &first, &last); + scoutfs_key_copy(&first, &ment->first); + scoutfs_key_copy(&last, &ment->last); + + return net_ment; +} + +/* point a native manifest entry at a contiguous net manifest */ +static void init_ment_net_ment(struct scoutfs_manifest_entry *ment, + struct scoutfs_net_manifest_entry *net_ment) +{ + struct scoutfs_key_buf first; + struct scoutfs_key_buf last; + + init_net_ment_keys(net_ment, &first, &last); + scoutfs_key_clone(&ment->first, &first); + scoutfs_key_clone(&ment->last, &last); + + ment->segno = le64_to_cpu(net_ment->segno); + ment->seq = le64_to_cpu(net_ment->seq); + ment->level = net_ment->level; +} + +static unsigned net_ment_bytes(struct scoutfs_net_manifest_entry *net_ment) +{ + return offsetof(struct scoutfs_net_manifest_entry, + keys[le16_to_cpu(net_ment->first_key_len) + + le16_to_cpu(net_ment->last_key_len)]); +} + /* * This is new segments arriving. It needs to wait for level 0 to be * free. It has relatively little visibility into the manifest, though. @@ -443,19 +507,20 @@ static struct send_buf *process_record_segment(struct super_block *sb, void *req, int req_len) { DECLARE_NET_INFO(sb, nti); - struct scoutfs_manifest_entry *ment; + struct scoutfs_manifest_entry ment; + struct scoutfs_net_manifest_entry *net_ment; struct commit_waiter cw; struct send_buf *sbuf; int ret; - if (req_len < sizeof(struct scoutfs_manifest_entry)) { + if (req_len < sizeof(struct scoutfs_net_manifest_entry)) { sbuf = ERR_PTR(-EINVAL); goto out; } - ment = req; + net_ment = req; - if (req_len != scoutfs_manifest_bytes(ment)) { + if (req_len != net_ment_bytes(net_ment)) { sbuf = ERR_PTR(-EINVAL); goto out; } @@ -472,7 +537,9 @@ retry: goto retry; } - ret = scoutfs_manifest_add_ment(sb, ment); + init_ment_net_ment(&ment, net_ment); + + ret = scoutfs_manifest_add(sb, &ment); scoutfs_manifest_unlock(sb); if (ret == 0) @@ -542,73 +609,6 @@ out: return sbuf; } -/* - * Find the manifest entries that intersect with the request's key - * range. We lock the manifest and get pointers to the manifest entries - * that intersect. We then allocate a reply buffer and copy them over. 
- */ -static struct send_buf *process_manifest_range_entries(struct super_block *sb, - void *req, int req_len) -{ - struct scoutfs_net_key_range *kr = req; - struct scoutfs_net_manifest_entries *ments; - struct scoutfs_manifest_entry **found = NULL; - struct scoutfs_manifest_entry *ment; - struct scoutfs_key_buf start; - struct scoutfs_key_buf end; - struct send_buf *sbuf; - unsigned total; - unsigned bytes; - int i; - - /* XXX this is a write lock and should be a read lock */ - scoutfs_manifest_lock(sb); - - if (req_len < sizeof(struct scoutfs_net_key_range) || - req_len < offsetof(struct scoutfs_net_key_range, - key_bytes[le16_to_cpu(kr->start_len) + - le16_to_cpu(kr->end_len)])) { - sbuf = ERR_PTR(-EINVAL); - goto out; - } - - scoutfs_key_init(&start, kr->key_bytes, le16_to_cpu(kr->start_len)); - scoutfs_key_init(&end, kr->key_bytes + le16_to_cpu(kr->start_len), - le16_to_cpu(kr->end_len)); - - found = scoutfs_manifest_find_range_entries(sb, &start, &end, &total); - if (IS_ERR(found)) { - sbuf = ERR_CAST(found); - goto out; - } - - total += sizeof(struct scoutfs_net_manifest_entries); - - sbuf = alloc_sbuf(total); - if (!sbuf) { - sbuf = ERR_PTR(-ENOMEM); - goto out; - } - - ments = (void *)sbuf->nh->data; - ment = ments->ments; - - for (i = 0; found[i]; i++) { - bytes = scoutfs_manifest_bytes(found[i]); - memcpy(ment, found[i], bytes); - ment = (void *)((char *)ment + bytes); - } - - ments->nr = cpu_to_le16(i); - sbuf->nh->status = SCOUTFS_NET_STATUS_SUCCESS; - -out: - scoutfs_manifest_unlock(sb); - if (!IS_ERR_OR_NULL(found)) - kfree(found); - return sbuf; -} - /* * XXX should this call into inodes? not sure about the layering here. */ @@ -790,8 +790,6 @@ static proc_func_t type_proc_func(u8 type) { static proc_func_t funcs[] = { [SCOUTFS_NET_ALLOC_INODES] = process_alloc_inodes, - [SCOUTFS_NET_MANIFEST_RANGE_ENTRIES] = - process_manifest_range_entries, [SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno, [SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment, [SCOUTFS_NET_BULK_ALLOC] = process_bulk_alloc, @@ -889,7 +887,8 @@ static void destroy_server_state(struct super_block *sb) scoutfs_compact_destroy(sb); scoutfs_alloc_destroy(sb); - scoutfs_manifest_destroy(sb); + /* XXX this drops dirty data on the floor.. has it committed? 
*/ + scoutfs_btree_write_complete(sb); /* XXX these should be persistent and reclaimed during recovery */ list_for_each_entry_safe(ps, tmp, &nti->pending_seqs, head) { @@ -918,6 +917,7 @@ static void scoutfs_net_proc_func(struct work_struct *work) mutex_lock(&nti->mutex); if (!nti->server_loaded) { ret = scoutfs_read_supers(sb, &SCOUTFS_SB(sb)->super) ?: + scoutfs_btree_prepare_write(sb) ?: scoutfs_manifest_setup(sb) ?: scoutfs_alloc_setup(sb) ?: scoutfs_compact_setup(sb); @@ -1526,22 +1526,24 @@ static int record_segment_reply(struct super_block *sb, void *reply, int ret, int scoutfs_net_record_segment(struct super_block *sb, struct scoutfs_segment *seg, u8 level) { - struct scoutfs_manifest_entry *ment; + struct scoutfs_net_manifest_entry *net_ment; struct record_segment_args args; + struct scoutfs_manifest_entry ment; int ret; - ment = scoutfs_seg_manifest_entry(sb, seg, level); - if (!ment) { + scoutfs_seg_init_ment(&ment, level, seg); + net_ment = alloc_net_ment(&ment); + if (!net_ment) { ret = -ENOMEM; goto out; } init_completion(&args.comp); - ret = add_send_buf(sb, SCOUTFS_NET_RECORD_SEGMENT, ment, - scoutfs_manifest_bytes(ment), + ret = add_send_buf(sb, SCOUTFS_NET_RECORD_SEGMENT, net_ment, + net_ment_bytes(net_ment), record_segment_reply, &args); - kfree(ment); + kfree(net_ment); if (ret == 0) { wait_for_completion(&args.comp); ret = args.ret; @@ -1592,119 +1594,6 @@ int scoutfs_net_alloc_segno(struct super_block *sb, u64 *segno) return ret; } -struct manifest_range_entries_args { - struct list_head *list; - struct completion comp; - int ret; -}; - -/* - * The server has given us entries that intersect with our request's - * key range. Our caller is still blocked waiting for our completion. - * We walk the manifest entries and add native manifest refs to their - * list and wake them. - */ -static int manifest_range_entries_reply(struct super_block *sb, void *reply, - int reply_bytes, void *arg) -{ - struct manifest_range_entries_args *args = arg; - struct scoutfs_net_manifest_entries *ments = reply; - struct scoutfs_manifest_entry *ment; - unsigned bytes; - int ret = 0; - int i; - - if (reply_bytes < 0) { - ret = reply_bytes; - goto out; - } - - reply_bytes -= sizeof(struct scoutfs_net_manifest_entries); - if (reply_bytes < 0) { - ret = -EINVAL; - goto out; - } - - ment = ments->ments; - for (i = 0; i < le16_to_cpu(ments->nr); i++) { - - - if (reply_bytes < sizeof(struct scoutfs_manifest_entry)) { - ret = -EINVAL; - goto out; - } - - bytes = scoutfs_manifest_bytes(ment); - reply_bytes -= bytes; - if (reply_bytes < 0) { - ret = -EINVAL; - goto out; - } - - ret = scoutfs_manifest_add_ment_ref(sb, args->list, ment); - if (ret) - break; - - ment = (void *)((char *)ment + bytes); - } - -out: - args->ret = ret; - complete(&args->comp); /* args can be freed from this point */ - return ret; -} - -/* - * Ask the manifest server for the manifest entries whose key range - * intersects with the callers key range. The reply func will fill the - * caller's list with the reply's entries. - * - * XXX for now this can't be interrupted. The reply func which is off - * in work in a worker thread is blocking to allocate and put things on - * a list in our stack. We'd need better lifetime support to let it - * find out that we've returned and that it should stop processing the - * reply. 
- */ -int scoutfs_net_manifest_range_entries(struct super_block *sb, - struct scoutfs_key_buf *start, - struct scoutfs_key_buf *end, - struct list_head *list) -{ - struct manifest_range_entries_args args; - struct scoutfs_net_key_range *kr; - struct scoutfs_key_buf start_key; - struct scoutfs_key_buf end_key; - unsigned len; - int ret; - - len = sizeof(struct scoutfs_net_key_range) + - start->key_len + end->key_len; - kr = kmalloc(len, GFP_NOFS); - if (!kr) - return -ENOMEM; - - kr->start_len = cpu_to_le16(start->key_len); - kr->end_len = cpu_to_le16(end->key_len); - - scoutfs_key_init(&start_key, kr->key_bytes, start->key_len); - scoutfs_key_init(&end_key, kr->key_bytes + start->key_len, - end->key_len); - scoutfs_key_copy(&start_key, start); - scoutfs_key_copy(&end_key, end); - - args.list = list; - init_completion(&args.comp); - - ret = add_send_buf(sb, SCOUTFS_NET_MANIFEST_RANGE_ENTRIES, kr, len, - manifest_range_entries_reply, &args); - kfree(kr); - if (ret) - return ret; - - wait_for_completion(&args.comp); - return args.ret; -} - static int alloc_inodes_reply(struct super_block *sb, void *reply, int ret, void *arg) { diff --git a/kmod/src/net.h b/kmod/src/net.h index ea131144..bcfa34f9 100644 --- a/kmod/src/net.h +++ b/kmod/src/net.h @@ -5,10 +5,6 @@ struct scoutfs_key_buf; struct scoutfs_segment; int scoutfs_net_alloc_inodes(struct super_block *sb); -int scoutfs_net_manifest_range_entries(struct super_block *sb, - struct scoutfs_key_buf *start, - struct scoutfs_key_buf *end, - struct list_head *list); int scoutfs_net_alloc_segno(struct super_block *sb, u64 *segno); int scoutfs_net_record_segment(struct super_block *sb, struct scoutfs_segment *seg, u8 level); diff --git a/kmod/src/ring.c b/kmod/src/ring.c index 657bfe82..26e256f6 100644 --- a/kmod/src/ring.c +++ b/kmod/src/ring.c @@ -350,8 +350,7 @@ void *scoutfs_ring_prev(struct scoutfs_ring_info *ring, void *data) /* * Calculate the most blocks we could have to use to store a given number - * of bytes of entries. At worst each block has a header and leaves one - * less than the max manifest entry unused. + * of bytes of entries. */ static unsigned most_blocks(unsigned long bytes) { @@ -359,8 +358,7 @@ static unsigned most_blocks(unsigned long bytes) space = SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_ring_block) - - (sizeof(struct scoutfs_manifest_entry) + - (2 * SCOUTFS_MAX_KEY_SIZE) - 1); + sizeof(struct scoutfs_alloc_region); return DIV_ROUND_UP(bytes, space); } diff --git a/kmod/src/seg.c b/kmod/src/seg.c index fffbbf34..1cda3462 100644 --- a/kmod/src/seg.c +++ b/kmod/src/seg.c @@ -673,11 +673,8 @@ bool scoutfs_seg_append_item(struct super_block *sb, struct scoutfs_segment *seg return true; } -/* - * Add a dirty manifest entry for the given segment at the given level. 
- */ -int scoutfs_seg_manifest_add(struct super_block *sb, - struct scoutfs_segment *seg, u8 level) +void scoutfs_seg_init_ment(struct scoutfs_manifest_entry *ment, int level, + struct scoutfs_segment *seg) { struct scoutfs_segment_block *sblk = off_ptr(seg, 0); struct scoutfs_key_buf first; @@ -685,38 +682,8 @@ int scoutfs_seg_manifest_add(struct super_block *sb, first_last_keys(seg, &first, &last); - return scoutfs_manifest_add(sb, &first, &last, le64_to_cpu(sblk->segno), - le64_to_cpu(sblk->seq), level); -} - -int scoutfs_seg_manifest_del(struct super_block *sb, - struct scoutfs_segment *seg, u8 level) -{ - struct scoutfs_segment_block *sblk = off_ptr(seg, 0); - struct scoutfs_key_buf first; - - first_last_keys(seg, &first, NULL); - - return scoutfs_manifest_del(sb, &first, le64_to_cpu(sblk->seq), level); -} - -/* - * Return an allocated manifest entry that describes the segment, returns - * NULL if it couldn't allocate. - */ -struct scoutfs_manifest_entry * -scoutfs_seg_manifest_entry(struct super_block *sb, - struct scoutfs_segment *seg, u8 level) -{ - struct scoutfs_segment_block *sblk = off_ptr(seg, 0); - struct scoutfs_key_buf first; - struct scoutfs_key_buf last; - - first_last_keys(seg, &first, &last); - - return scoutfs_manifest_alloc_entry(sb, &first, &last, - le64_to_cpu(sblk->segno), - le64_to_cpu(sblk->seq), level); + scoutfs_manifest_init_entry(ment, level, le64_to_cpu(sblk->segno), + le64_to_cpu(sblk->seq), &first, &last); } /* diff --git a/kmod/src/seg.h b/kmod/src/seg.h index 9d1cd4c9..5a2909d4 100644 --- a/kmod/src/seg.h +++ b/kmod/src/seg.h @@ -3,6 +3,7 @@ struct scoutfs_bio_completion; struct scoutfs_key_buf; +struct scoutfs_manifest_entry; struct kvec; /* this is only visible for trace events */ @@ -39,19 +40,13 @@ bool scoutfs_seg_fits_single(u32 nr_items, u32 key_bytes, u32 val_bytes); bool scoutfs_seg_append_item(struct super_block *sb, struct scoutfs_segment *seg, struct scoutfs_key_buf *key, struct kvec *val, u8 flags, __le32 **links); -int scoutfs_seg_manifest_add(struct super_block *sb, - struct scoutfs_segment *seg, u8 level); -int scoutfs_seg_manifest_del(struct super_block *sb, - struct scoutfs_segment *seg, u8 level); +void scoutfs_seg_init_ment(struct scoutfs_manifest_entry *ment, int level, + struct scoutfs_segment *seg); int scoutfs_seg_submit_write(struct super_block *sb, struct scoutfs_segment *seg, struct scoutfs_bio_completion *comp); -struct scoutfs_manifest_entry * -scoutfs_seg_manifest_entry(struct super_block *sb, - struct scoutfs_segment *seg, u8 level); - int scoutfs_seg_setup(struct super_block *sb); void scoutfs_seg_destroy(struct super_block *sb);
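A minimal sketch of the packed manifest btree key that the new format.h comment describes — pack_manifest_btree_key() below is a hypothetical userspace illustration, not a function from this patch, and it assumes the btree orders these keys with memcmp() as that comment states:

	#include <stdint.h>
	#include <string.h>

	/*
	 * Pack the memcmp-sortable btree key for a manifest entry: the
	 * level sorts first, then a big-endian seq (level 0) or the
	 * segment's first item key (level 1+) sorts within the level.
	 */
	static unsigned pack_manifest_btree_key(uint8_t *buf, uint8_t level,
						uint64_t seq,
						const uint8_t *first_key,
						unsigned first_key_len)
	{
		unsigned len = 0;
		int i;

		buf[len++] = level;
		if (level == 0) {
			/* big-endian so memcmp() compares seqs numerically */
			for (i = 7; i >= 0; i--)
				buf[len++] = seq >> (i * 8);
		} else {
			memcpy(buf + len, first_key, first_key_len);
			len += first_key_len;
		}
		return len;
	}

Two packed keys can then be ordered with a single memcmp() (with lengths presumably breaking ties), which is what lets the btree use these dense keys directly as separators in parent blocks without duplicating the large segment keys.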