/* * Copyright (C) 2017 Versity Software, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License v2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ #include #include #include #include "super.h" #include "format.h" #include "kvec.h" #include "seg.h" #include "bio.h" #include "cmp.h" #include "compact.h" #include "manifest.h" #include "counters.h" #include "alloc.h" #include "server.h" #include "scoutfs_trace.h" /* * Compaction is what maintains the exponentially increasing number of * segments in each level of the lsm tree and is what merges duplicate * and deletion keys. * * When the manifest is modified in a way that requires compaction it * kicks the compaction thread. The compaction thread calls into the * manifest to find the segments that need to be compaction. * * The compaction operation itself always involves a single "upper" * segment at a given level and a limited number of "lower" segments at * the next higher level whose key range intersects with the upper * segment. * * Compaction proceeds by iterating over the items in the upper segment * and items in each of the lower segments in sort order. The items * from the two input segments are copied into new output segments in * sorted order. Item space is reclaimed as duplicate or deletion items * are removed. * * Once the compaction is completed the manifest is updated to remove * the input segments and add the output segments. Here segment space * is reclaimed when the input items fit in fewer output segments. 
*/

/* per-super compaction context: one work item on a private workqueue */
struct compact_info {
	struct super_block *sb;
	struct workqueue_struct *workq;
	struct work_struct work;
};

#define DECLARE_COMPACT_INFO(sb, name) \
	struct compact_info *name = SCOUTFS_SB(sb)->compact_info

/*
 * One input or output segment of a compaction.  Input csegs are built
 * from manifest entries as the manifest is walked; output csegs are
 * built around newly allocated segments as items are merged into them.
 */
struct compact_seg {
	struct list_head entry;

	u64 segno;
	u64 seq;
	u8 level;
	struct scoutfs_key_buf *first;
	struct scoutfs_key_buf *last;
	struct scoutfs_segment *seg;
	/* offset of the next item to visit in ->seg */
	int off;
	/* segment is moved between levels whole; don't free its segno */
	bool part_of_move;
};

/*
 * A compaction request.  It's filled up in scoutfs_compact_add() as
 * the manifest is walked and it finds segments involved in the
 * compaction.
 */
struct compact_cursor {
	struct list_head csegs;

	/*
	 * segnos allocated for output segments; slots are zeroed as
	 * they're consumed and the remaining non-zero slots are freed
	 * back to the allocator when the compaction finishes.
	 */
	u64 segnos[SCOUTFS_COMPACTION_MAX_UPDATE];
	unsigned nr_segnos;

	u8 lower_level;
	u8 last_level;

	struct compact_seg *upper;
	struct compact_seg *lower;
	/* upper items past the last lower key must stay in the upper level */
	bool sticky;

	struct compact_seg *last_lower;

	__le32 *links[SCOUTFS_MAX_SKIP_LINKS];
};

/*
 * Free a cseg that has already been removed from its list, dropping
 * its segment and key references.
 */
static void free_cseg(struct super_block *sb, struct compact_seg *cseg)
{
	WARN_ON_ONCE(!list_empty(&cseg->entry));

	scoutfs_seg_put(cseg->seg);
	scoutfs_key_free(sb, cseg->first);
	scoutfs_key_free(sb, cseg->last);
	kfree(cseg);
}

/*
 * Allocate a cseg with duplicated copies of the given first and last
 * keys.  Returns NULL on allocation failure.
 */
static struct compact_seg *alloc_cseg(struct super_block *sb,
				      struct scoutfs_key_buf *first,
				      struct scoutfs_key_buf *last)
{
	struct compact_seg *cseg;

	cseg = kzalloc(sizeof(struct compact_seg), GFP_NOFS);
	if (cseg) {
		INIT_LIST_HEAD(&cseg->entry);
		cseg->first = scoutfs_key_dup(sb, first);
		cseg->last = scoutfs_key_dup(sb, last);
		if (!cseg->first || !cseg->last) {
			free_cseg(sb, cseg);
			cseg = NULL;
		}
	}

	return cseg;
}

/* unlink and free every cseg on the list */
static void free_cseg_list(struct super_block *sb, struct list_head *list)
{
	struct compact_seg *cseg;
	struct compact_seg *tmp;

	list_for_each_entry_safe(cseg, tmp, list, entry) {
		list_del_init(&cseg->entry);
		free_cseg(sb, cseg);
	}
}

/*
 * Read the cseg's segment if it hasn't been read already.  Returns 0
 * when cseg is NULL or its segment is already present.
 */
static int read_segment(struct super_block *sb, struct compact_seg *cseg)
{
	struct scoutfs_segment *seg;
	int ret;

	if (cseg == NULL || cseg->seg)
		return 0;

	seg = scoutfs_seg_submit_read(sb, cseg->segno);
	if (IS_ERR(seg)) {
		ret =
		      PTR_ERR(seg);
	} else {
		cseg->seg = seg;
		scoutfs_inc_counter(sb, compact_segment_read);
		ret = scoutfs_seg_wait(sb, cseg->seg);
	}

	/* XXX verify read segment metadata */

	return ret;
}

/* return the cseg after the given one on the cursor's list, or NULL */
static struct compact_seg *next_spos(struct compact_cursor *curs,
				     struct compact_seg *cseg)
{
	if (cseg->entry.next == &curs->csegs)
		return NULL;

	return list_next_entry(cseg, entry);
}

/*
 * Point the caller's key and value kvecs at the next item that should
 * be copied from the upper or lower segments.  We use the item that
 * has the lowest key or the upper if they're the same.  We advance the
 * cursor past the item that is returned.
 *
 * Returns 1 when an item was produced, 0 when the inputs are exhausted
 * (or a sticky boundary was reached), and -errno on read errors.
 *
 * XXX this will get fancier as we get range deletion items and
 * incremental update items.
 */
static int next_item(struct super_block *sb, struct compact_cursor *curs,
		     struct scoutfs_key_buf *item_key, struct kvec *item_val,
		     u8 *item_flags)
{
	struct compact_seg *upper = curs->upper;
	struct compact_seg *lower = curs->lower;
	struct scoutfs_key_buf lower_key;
	SCOUTFS_DECLARE_KVEC(lower_val);
	u8 lower_flags;
	int cmp;
	int ret;

retry:
	/*
	 * A failed item lookup drops the upper input; presumably it
	 * means ->off has advanced past the last item -- confirm
	 * scoutfs_seg_item_ptrs() semantics.
	 */
	if (upper) {
		ret = scoutfs_seg_item_ptrs(upper->seg, upper->off,
					    item_key, item_val, item_flags);
		if (ret < 0)
			upper = NULL;
	}

	/* walk to the next lower segment that still has an item */
	while (lower) {
		ret = read_segment(sb, lower);
		if (ret)
			goto out;

		ret = scoutfs_seg_item_ptrs(lower->seg, lower->off, &lower_key,
					    lower_val, &lower_flags);
		if (ret == 0)
			break;

		lower = next_spos(curs, lower);
	}

	/* we're done if all are empty */
	if (!upper && !lower) {
		ret = 0;
		goto out;
	}

	/*
	 * < 0: return upper, advance upper
	 * == 0: return upper, advance both
	 * > 0: return lower, advance lower
	 */
	if (upper && lower)
		cmp = scoutfs_key_compare(item_key, &lower_key);
	else if (upper)
		cmp = -1;
	else
		cmp = 1;

	if (cmp > 0) {
		scoutfs_key_clone(item_key, &lower_key);
		scoutfs_kvec_clone(item_val, lower_val);
		*item_flags = lower_flags;
	}

	/*
	 * If we have a sticky compaction then we can't mix items from
	 * the upper level past the last lower key into the lower level.
	 * The caller will notice when they're emptying the final upper
	 * level in a sticky merge and leave it at the upper level.
	 */
	if (curs->sticky && curs->lower &&
	    (!lower || lower == curs->last_lower) &&
	    scoutfs_key_compare(item_key, curs->last_lower->last) > 0) {
		ret = 0;
		goto out;
	}

	/* advance whichever input(s) produced the returned item */
	if (cmp <= 0)
		upper->off = scoutfs_seg_next_off(upper->seg, upper->off);
	if (cmp >= 0)
		lower->off = scoutfs_seg_next_off(lower->seg, lower->off);

	/*
	 * Deletion items make their way down all the levels, replacing
	 * all the duplicate items that they find.  When we're
	 * compacting to the last level we can remove them by retrying
	 * the search after we've advanced past them.
	 */
	if ((curs->lower_level == curs->last_level) &&
	    ((*item_flags) & SCOUTFS_ITEM_FLAG_DELETION))
		goto retry;

	ret = 1;
out:
	/* record how far we got for the next call */
	curs->upper = upper;
	curs->lower = lower;

	return ret;
}

/*
 * Merge the cursor's input segments into newly allocated output
 * segments, submitting each finished output segment for write.  Output
 * csegs are appended to the results list; the caller waits for the
 * submitted io and then updates the manifest.
 */
static int compact_segments(struct super_block *sb,
			    struct compact_cursor *curs,
			    struct scoutfs_bio_completion *comp,
			    struct list_head *results)
{
	struct scoutfs_key_buf item_key;
	SCOUTFS_DECLARE_KVEC(item_val);
	struct scoutfs_segment *seg;
	struct compact_seg *cseg;
	struct compact_seg *upper;
	struct compact_seg *lower;
	unsigned next_segno = 0;
	bool append_filled = false;
	int ret = 0;
	u8 flags;

	scoutfs_inc_counter(sb, compact_operations);
	if (curs->sticky)
		scoutfs_inc_counter(sb, compact_sticky_upper);

	while (curs->upper || curs->lower) {

		upper = curs->upper;
		lower = curs->lower;

		/*
		 * If we're at the start of the upper segment and
		 * there's no lower segment then we might as well just
		 * move the segment in the manifest.  We can't do this
		 * if we're moving to the last level because we might
		 * need to drop any deletion items.
		 *
		 * XXX We should have metadata in the manifest to tell
		 * us that there's no deletion items in the segment.
		 */
		if (upper && upper->off == 0 && !lower && !curs->sticky &&
		    ((upper->level + 1) < curs->last_level)) {
			/*
			 * XXX blah! these csegs are getting
			 * ridiculous.
			 * We should have a robust manifest entry
			 * iterator that reading and compacting can use.
			 */
			cseg = alloc_cseg(sb, upper->first, upper->last);
			if (!cseg) {
				ret = -ENOMEM;
				break;
			}

			/* same segment and seq, one level down */
			cseg->segno = upper->segno;
			cseg->seq = upper->seq;
			cseg->level = upper->level + 1;
			cseg->seg = upper->seg;
			if (cseg->seg)
				scoutfs_seg_get(cseg->seg);
			list_add_tail(&cseg->entry, results);

			/* don't mess with its segno */
			upper->part_of_move = true;
			cseg->part_of_move = true;

			curs->upper = NULL;
			scoutfs_inc_counter(sb, compact_segment_moved);
			break;
		}

		/* we're going to need its next key */
		ret = read_segment(sb, upper);
		if (ret)
			break;

		/*
		 * XXX we could intelligently skip reading and merging
		 * lower segments here.  The lower segment won't change
		 * if:
		 *  - the lower segment is entirely before the upper
		 *  - the lower segment is full
		 *
		 * We don't have the metadata to determine that it's
		 * full today so we want to read lower segments that
		 * don't overlap so that we can merge partial lowers
		 * with its neighbours.
		 */
		ret = read_segment(sb, lower);
		if (ret)
			break;

		/* a filled output segment may have left an item pending */
		if (!append_filled)
			ret = next_item(sb, curs, &item_key, item_val, &flags);
		else
			ret = 1;
		if (ret <= 0)
			break;

		/* no cseg keys, manifest update uses seg item keys */
		cseg = kzalloc(sizeof(struct compact_seg), GFP_NOFS);
		if (!cseg) {
			ret = -ENOMEM;
			break;
		}

		/* consume a pre-allocated segno, zeroing its slot */
		cseg->segno = curs->segnos[next_segno];
		curs->segnos[next_segno] = 0;
		next_segno++;

		/*
		 * Compaction can free all the remaining items resulting
		 * in an empty output segment.  We just free it in that
		 * case.
		 */
		ret = scoutfs_seg_alloc(sb, cseg->segno, &seg);
		if (ret < 0) {
			/* give the segno back to its slot */
			next_segno--;
			curs->segnos[next_segno] = cseg->segno;
			kfree(cseg);
			/*
			 * NOTE(review): this assumes scoutfs_seg_alloc()
			 * leaves *seg NULL on failure; otherwise this put
			 * would act on a stale segment pointer from a
			 * previous loop iteration -- confirm.
			 */
			scoutfs_seg_put(seg);
			break;
		}

		/*
		 * The remaining upper items in a sticky merge have to
		 * be written into the upper level.
		 */
		if (curs->sticky && !lower) {
			cseg->level = curs->lower_level - 1;
			scoutfs_inc_counter(sb, compact_sticky_written);
		} else {
			cseg->level = curs->lower_level;
		}

		/* csegs will be cleaned up once they're on the list */
		cseg->seg = seg;
		list_add_tail(&cseg->entry, results);

		/* append items until the segment fills or the input empties */
		for (;;) {
			if (!scoutfs_seg_append_item(sb, seg, &item_key,
						     item_val, flags,
						     curs->links)) {
				/* item didn't fit; carry it to the next seg */
				append_filled = true;
				ret = 0;
				break;
			}

			ret = next_item(sb, curs, &item_key, item_val, &flags);
			if (ret <= 0) {
				append_filled = false;
				break;
			}
		}
		if (ret < 0)
			break;

		/* start a complete segment write now, we'll wait later */
		ret = scoutfs_seg_submit_write(sb, seg, comp);
		if (ret)
			break;

		scoutfs_inc_counter(sb, compact_segment_written);
	}

	return ret;
}

/*
 * Manifest walking is providing the details of the overall compaction
 * operation.
 */
void scoutfs_compact_describe(struct super_block *sb, void *data,
			      u8 upper_level, u8 last_level, bool sticky)
{
	struct compact_cursor *curs = data;

	curs->lower_level = upper_level + 1;
	curs->last_level = last_level;
	curs->sticky = sticky;
}

/*
 * Add a segment involved in the compaction operation.
 *
 * XXX Today we know that the caller is always adding only one upper
 * segment and is then possibly adding all the lower overlapping
 * segments.
 */
int scoutfs_compact_add(struct super_block *sb, void *data,
			struct scoutfs_manifest_entry *ment)
{
	struct compact_cursor *curs = data;
	struct compact_seg *cseg;
	int ret;

	cseg = alloc_cseg(sb, &ment->first, &ment->last);
	if (!cseg) {
		ret = -ENOMEM;
		goto out;
	}

	list_add_tail(&cseg->entry, &curs->csegs);

	cseg->segno = ment->segno;
	cseg->seq = ment->seq;
	cseg->level = ment->level;

	/* the first added cseg is the upper, the second starts the lowers */
	if (!curs->upper)
		curs->upper = cseg;
	else if (!curs->lower)
		curs->lower = cseg;

	/* remember the final lower segment for sticky boundary checks */
	if (curs->lower)
		curs->last_lower = cseg;

	ret = 0;
out:
	return ret;
}

/*
 * Give the compaction cursor a segno to allocate from.
 */
void scoutfs_compact_add_segno(struct super_block *sb, void *data, u64 segno)
{
	struct compact_cursor *curs = data;

	/*
	 * NOTE(review): no bounds check here; callers presumably add at
	 * most SCOUTFS_COMPACTION_MAX_UPDATE segnos -- confirm.
	 */
	curs->segnos[curs->nr_segnos++] = segno;
}

/*
 * Commit the result of a compaction based on the state of the cursor.
 * The net caller stops the manifest from being written while we're
 * making changes.  We lock the manifest to atomically make our
 * changes.
 *
 * The error handling is sketchy here because calling the manifest from
 * here is temporary.  We should be sending a message to the server
 * instead of calling the allocator and manifest.
 */
int scoutfs_compact_commit(struct super_block *sb, void *c, void *r)
{
	struct scoutfs_manifest_entry ment;
	struct compact_cursor *curs = c;
	struct list_head *results = r;
	struct compact_seg *cseg;
	int ret;
	int i;

	/* free unused segnos that were allocated for the compaction */
	for (i = 0; i < curs->nr_segnos; i++) {
		if (curs->segnos[i]) {
			ret = scoutfs_alloc_free(sb, curs->segnos[i]);
			BUG_ON(ret);
		}
	}

	scoutfs_manifest_lock(sb);

	/* delete input segments, probably freeing their segnos */
	list_for_each_entry(cseg, &curs->csegs, entry) {
		if (!cseg->part_of_move) {
			ret = scoutfs_alloc_free(sb, cseg->segno);
			BUG_ON(ret);
		}

		scoutfs_manifest_init_entry(&ment, cseg->level, 0, cseg->seq,
					    cseg->first, NULL);
		ret = scoutfs_manifest_del(sb, &ment);
		BUG_ON(ret);
	}

	/* add output entries */
	list_for_each_entry(cseg, results, entry) {
		/* XXX moved upper segments won't have read the segment :P */
		if (cseg->seg)
			scoutfs_seg_init_ment(&ment, cseg->level, cseg->seg);
		else
			scoutfs_manifest_init_entry(&ment, cseg->level,
						    cseg->segno, cseg->seq,
						    cseg->first, cseg->last);
		ret = scoutfs_manifest_add(sb, &ment);
		BUG_ON(ret);
	}

	scoutfs_manifest_unlock(sb);

	return 0;
}

/*
 * The compaction worker tries to make forward progress with compaction
 * every time it's kicked.
It pretends to send a message requesting * compaction parameters but in reality the net request function there * is calling directly into the manifest and back into our compaction * add routines. * * We always try to clean up everything on errors. */ static void scoutfs_compact_func(struct work_struct *work) { struct compact_info *ci = container_of(work, struct compact_info, work); struct super_block *sb = ci->sb; struct compact_cursor curs = {{NULL,}}; struct scoutfs_bio_completion comp; struct compact_seg *cseg; LIST_HEAD(results); int ret; int err; INIT_LIST_HEAD(&curs.csegs); scoutfs_bio_init_comp(&comp); ret = scoutfs_client_get_compaction(sb, (void *)&curs); /* short circuit no compaction work to do */ if (ret == 0 && list_empty(&curs.csegs)) return; /* trace compaction ranges */ list_for_each_entry(cseg, &curs.csegs, entry) { trace_scoutfs_compact_input(sb, cseg->level, cseg->segno, cseg->seq, cseg->first, cseg->last); } if (ret == 0 && !list_empty(&curs.csegs)) { ret = compact_segments(sb, &curs, &comp, &results); /* always wait for io completion */ err = scoutfs_bio_wait_comp(sb, &comp); if (!ret && err) ret = err; } /* don't update manifest on error, just free segnos */ if (ret) { list_for_each_entry(cseg, &results, entry) { if (!cseg->part_of_move) curs.segnos[curs.nr_segnos++] = cseg->segno; } free_cseg_list(sb, &curs.csegs); free_cseg_list(sb, &results); } err = scoutfs_client_finish_compaction(sb, &curs, &results); if (!ret && err) ret = err; free_cseg_list(sb, &curs.csegs); free_cseg_list(sb, &results); WARN_ON_ONCE(ret); trace_scoutfs_compact_func(sb, ret); } void scoutfs_compact_kick(struct super_block *sb) { DECLARE_COMPACT_INFO(sb, ci); queue_work(ci->workq, &ci->work); } int scoutfs_compact_setup(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); struct compact_info *ci; ci = kzalloc(sizeof(struct compact_info), GFP_KERNEL); if (!ci) return -ENOMEM; ci->sb = sb; INIT_WORK(&ci->work, scoutfs_compact_func); ci->workq = 
alloc_workqueue("scoutfs_compact", 0, 1); if (!ci->workq) { kfree(ci); return -ENOMEM; } sbi->compact_info = ci; return 0; } /* * The system should be idle, there should not be any more manifest * modification which would kick compaction. */ void scoutfs_compact_destroy(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); DECLARE_COMPACT_INFO(sb, ci); if (ci) { flush_work(&ci->work); destroy_workqueue(ci->workq); sbi->compact_info = NULL; } }