Files
scoutfs/kmod/src/compact.c
Mark Fasheh 2c1f117bef scoutfs: replace trace_printk in compact.c
Signed-off-by: Mark Fasheh <mfasheh@versity.com>
2017-09-28 13:59:49 -07:00

669 lines
16 KiB
C

/*
* Copyright (C) 2017 Versity Software, Inc. All rights reserved.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include "super.h"
#include "format.h"
#include "kvec.h"
#include "seg.h"
#include "bio.h"
#include "cmp.h"
#include "compact.h"
#include "manifest.h"
#include "counters.h"
#include "alloc.h"
#include "server.h"
#include "scoutfs_trace.h"
/*
* Compaction is what maintains the exponentially increasing number of
* segments in each level of the lsm tree and is what merges duplicate
* and deletion keys.
*
* When the manifest is modified in a way that requires compaction it
* kicks the compaction thread. The compaction thread calls into the
* manifest to find the segments that need to be compaction.
*
* The compaction operation itself always involves a single "upper"
* segment at a given level and a limited number of "lower" segments at
* the next higher level whose key range intersects with the upper
* segment.
*
* Compaction proceeds by iterating over the items in the upper segment
* and items in each of the lower segments in sort order. The items
* from the two input segments are copied into new output segments in
* sorted order. Item space is reclaimed as duplicate or deletion items
* are removed.
*
* Once the compaction is completed the manifest is updated to remove
* the input segments and add the output segments. Here segment space
* is reclaimed when the input items fit in fewer output segments.
*/
struct compact_info {
struct super_block *sb;
struct workqueue_struct *workq;
struct work_struct work;
};
#define DECLARE_COMPACT_INFO(sb, name) \
struct compact_info *name = SCOUTFS_SB(sb)->compact_info
struct compact_seg {
struct list_head entry;
u64 segno;
u64 seq;
u8 level;
struct scoutfs_key_buf *first;
struct scoutfs_key_buf *last;
struct scoutfs_segment *seg;
int off;
bool part_of_move;
};
/*
* A compaction request. It's filled up in scoutfs_compact_add() as
* the manifest is wlaked and it finds segments involved in the compaction.
*/
struct compact_cursor {
struct list_head csegs;
/* buffer holds allocations and our returning them */
u64 segnos[SCOUTFS_COMPACTION_MAX_UPDATE];
unsigned nr_segnos;
u8 lower_level;
u8 last_level;
struct compact_seg *upper;
struct compact_seg *lower;
bool sticky;
struct compact_seg *last_lower;
__le32 *links[SCOUTFS_MAX_SKIP_LINKS];
};
static void free_cseg(struct super_block *sb, struct compact_seg *cseg)
{
WARN_ON_ONCE(!list_empty(&cseg->entry));
scoutfs_seg_put(cseg->seg);
scoutfs_key_free(sb, cseg->first);
scoutfs_key_free(sb, cseg->last);
kfree(cseg);
}
static struct compact_seg *alloc_cseg(struct super_block *sb,
struct scoutfs_key_buf *first,
struct scoutfs_key_buf *last)
{
struct compact_seg *cseg;
cseg = kzalloc(sizeof(struct compact_seg), GFP_NOFS);
if (cseg) {
INIT_LIST_HEAD(&cseg->entry);
cseg->first = scoutfs_key_dup(sb, first);
cseg->last = scoutfs_key_dup(sb, last);
if (!cseg->first || !cseg->last) {
free_cseg(sb, cseg);
cseg = NULL;
}
}
return cseg;
}
static void free_cseg_list(struct super_block *sb, struct list_head *list)
{
struct compact_seg *cseg;
struct compact_seg *tmp;
list_for_each_entry_safe(cseg, tmp, list, entry) {
list_del_init(&cseg->entry);
free_cseg(sb, cseg);
}
}
static int read_segment(struct super_block *sb, struct compact_seg *cseg)
{
struct scoutfs_segment *seg;
int ret;
if (cseg == NULL || cseg->seg)
return 0;
seg = scoutfs_seg_submit_read(sb, cseg->segno);
if (IS_ERR(seg)) {
ret = PTR_ERR(seg);
} else {
cseg->seg = seg;
scoutfs_inc_counter(sb, compact_segment_read);
ret = scoutfs_seg_wait(sb, cseg->seg);
}
/* XXX verify read segment metadata */
return ret;
}
static struct compact_seg *next_spos(struct compact_cursor *curs,
struct compact_seg *cseg)
{
if (cseg->entry.next == &curs->csegs)
return NULL;
return list_next_entry(cseg, entry);
}
/*
* Point the caller's key and value kvecs at the next item that should
* be copied from the upper or lower segments. We use the item that has
* the lowest key or the upper if they're the same. We advance the
* cursor past the item that is returned.
*
* XXX this will get fancier as we get range deletion items and
* incremental update items.
*/
static int next_item(struct super_block *sb, struct compact_cursor *curs,
struct scoutfs_key_buf *item_key, struct kvec *item_val,
u8 *item_flags)
{
struct compact_seg *upper = curs->upper;
struct compact_seg *lower = curs->lower;
struct scoutfs_key_buf lower_key;
SCOUTFS_DECLARE_KVEC(lower_val);
u8 lower_flags;
int cmp;
int ret;
retry:
if (upper) {
ret = scoutfs_seg_item_ptrs(upper->seg, upper->off,
item_key, item_val, item_flags);
if (ret < 0)
upper = NULL;
}
while (lower) {
ret = read_segment(sb, lower);
if (ret)
goto out;
ret = scoutfs_seg_item_ptrs(lower->seg, lower->off,
&lower_key, lower_val,
&lower_flags);
if (ret == 0)
break;
lower = next_spos(curs, lower);
}
/* we're done if all are empty */
if (!upper && !lower) {
ret = 0;
goto out;
}
/*
* < 0: return upper, advance upper
* == 0: return upper, advance both
* > 0: return lower, advance lower
*/
if (upper && lower)
cmp = scoutfs_key_compare(item_key, &lower_key);
else if (upper)
cmp = -1;
else
cmp = 1;
if (cmp > 0) {
scoutfs_key_clone(item_key, &lower_key);
scoutfs_kvec_clone(item_val, lower_val);
*item_flags = lower_flags;
}
/*
* If we have a sticky compaction then we can't mix items from
* the upper level past the last lower key into the lower level.
* The caller will notice when they're emptying the final upper
* level in a sticky merge and leave it at the upper level.
*/
if (curs->sticky && curs->lower &&
(!lower || lower == curs->last_lower) &&
scoutfs_key_compare(item_key, curs->last_lower->last) > 0) {
ret = 0;
goto out;
}
if (cmp <= 0)
upper->off = scoutfs_seg_next_off(upper->seg, upper->off);
if (cmp >= 0)
lower->off = scoutfs_seg_next_off(lower->seg, lower->off);
/*
* Deletion items make their way down all the levels, replacing
* all the duplicate items that they find. When we're
* compacting to the last level we can remove them by retrying
* the search after we've advanced past them.
*/
if ((curs->lower_level == curs->last_level) &&
((*item_flags) & SCOUTFS_ITEM_FLAG_DELETION))
goto retry;
ret = 1;
out:
curs->upper = upper;
curs->lower = lower;
return ret;
}
static int compact_segments(struct super_block *sb,
struct compact_cursor *curs,
struct scoutfs_bio_completion *comp,
struct list_head *results)
{
struct scoutfs_key_buf item_key;
SCOUTFS_DECLARE_KVEC(item_val);
struct scoutfs_segment *seg;
struct compact_seg *cseg;
struct compact_seg *upper;
struct compact_seg *lower;
unsigned next_segno = 0;
bool append_filled = false;
int ret = 0;
u8 flags;
scoutfs_inc_counter(sb, compact_operations);
if (curs->sticky)
scoutfs_inc_counter(sb, compact_sticky_upper);
while (curs->upper || curs->lower) {
upper = curs->upper;
lower = curs->lower;
/*
* If we're at the start of the upper segment and
* there's no lower segment then we might as well just
* move the segment in the manifest. We can't do this
* if we're moving to the last level because we might
* need to drop any deletion items.
*
* XXX We should have metadata in the manifest to tell
* us that there's no deletion items in the segment.
*/
if (upper && upper->off == 0 && !lower && !curs->sticky &&
((upper->level + 1) < curs->last_level)) {
/*
* XXX blah! these csegs are getting
* ridiculous. We should have a robust manifest
* entry iterator that reading and compacting
* can use.
*/
cseg = alloc_cseg(sb, upper->first, upper->last);
if (!cseg) {
ret = -ENOMEM;
break;
}
cseg->segno = upper->segno;
cseg->seq = upper->seq;
cseg->level = upper->level + 1;
cseg->seg = upper->seg;
if (cseg->seg)
scoutfs_seg_get(cseg->seg);
list_add_tail(&cseg->entry, results);
/* don't mess with its segno */
upper->part_of_move = true;
cseg->part_of_move = true;
curs->upper = NULL;
scoutfs_inc_counter(sb, compact_segment_moved);
break;
}
/* we're going to need its next key */
ret = read_segment(sb, upper);
if (ret)
break;
/*
* XXX we could intelligently skip reading and merging
* lower segments here. The lower segment won't change
* if:
* - the lower segment is entirely before the upper
* - the lower segment is full
*
* We don't have the metadata to determine that it's
* full today so we want to read lower segments that don't
* overlap so that we can merge partial lowers with
* its neighbours.
*/
ret = read_segment(sb, lower);
if (ret)
break;
if (!append_filled)
ret = next_item(sb, curs, &item_key, item_val, &flags);
else
ret = 1;
if (ret <= 0)
break;
/* no cseg keys, manifest update uses seg item keys */
cseg = kzalloc(sizeof(struct compact_seg), GFP_NOFS);
if (!cseg) {
ret = -ENOMEM;
break;
}
cseg->segno = curs->segnos[next_segno];
curs->segnos[next_segno] = 0;
next_segno++;
/*
* Compaction can free all the remaining items resulting
* in an empty output segment. We just free it in that
* case.
*/
ret = scoutfs_seg_alloc(sb, cseg->segno, &seg);
if (ret < 0) {
next_segno--;
curs->segnos[next_segno] = cseg->segno;
kfree(cseg);
scoutfs_seg_put(seg);
break;
}
/*
* The remaining upper items in a sticky merge have to
* be written into the upper level.
*/
if (curs->sticky && !lower) {
cseg->level = curs->lower_level - 1;
scoutfs_inc_counter(sb, compact_sticky_written);
} else {
cseg->level = curs->lower_level;
}
/* csegs will be claned up once they're on the list */
cseg->seg = seg;
list_add_tail(&cseg->entry, results);
for (;;) {
if (!scoutfs_seg_append_item(sb, seg, &item_key, item_val,
flags, curs->links)) {
append_filled = true;
ret = 0;
break;
}
ret = next_item(sb, curs, &item_key, item_val, &flags);
if (ret <= 0) {
append_filled = false;
break;
}
}
if (ret < 0)
break;
/* start a complete segment write now, we'll wait later */
ret = scoutfs_seg_submit_write(sb, seg, comp);
if (ret)
break;
scoutfs_inc_counter(sb, compact_segment_written);
}
return ret;
}
/*
* Manifest walking is providing the details of the overall compaction
* operation.
*/
void scoutfs_compact_describe(struct super_block *sb, void *data,
u8 upper_level, u8 last_level, bool sticky)
{
struct compact_cursor *curs = data;
curs->lower_level = upper_level + 1;
curs->last_level = last_level;
curs->sticky = sticky;
}
/*
* Add a segment involved in the compaction operation.
*
* XXX Today we know that the caller is always adding only one upper segment
* and is then possibly adding all the lower overlapping segments.
*/
int scoutfs_compact_add(struct super_block *sb, void *data,
struct scoutfs_manifest_entry *ment)
{
struct compact_cursor *curs = data;
struct compact_seg *cseg;
int ret;
cseg = alloc_cseg(sb, &ment->first, &ment->last);
if (!cseg) {
ret = -ENOMEM;
goto out;
}
list_add_tail(&cseg->entry, &curs->csegs);
cseg->segno = ment->segno;
cseg->seq = ment->seq;
cseg->level = ment->level;
if (!curs->upper)
curs->upper = cseg;
else if (!curs->lower)
curs->lower = cseg;
if (curs->lower)
curs->last_lower = cseg;
ret = 0;
out:
return ret;
}
/*
* Give the compaction cursor a segno to allocate from.
*/
void scoutfs_compact_add_segno(struct super_block *sb, void *data, u64 segno)
{
struct compact_cursor *curs = data;
curs->segnos[curs->nr_segnos++] = segno;
}
/*
* Commit the result of a compaction based on the state of the cursor.
* The net caller stops the manifest from being written while we're
* making changes. We lock the manifest to atomically make our changes.
*
* The erorr handling is sketchy here because calling the manifest from
* here is temporary. We should be sending a message to the server
* instead of calling the allocator and manifest.
*/
int scoutfs_compact_commit(struct super_block *sb, void *c, void *r)
{
struct scoutfs_manifest_entry ment;
struct compact_cursor *curs = c;
struct list_head *results = r;
struct compact_seg *cseg;
int ret;
int i;
/* free unused segnos that were allocated for the compaction */
for (i = 0; i < curs->nr_segnos; i++) {
if (curs->segnos[i]) {
ret = scoutfs_alloc_free(sb, curs->segnos[i]);
BUG_ON(ret);
}
}
scoutfs_manifest_lock(sb);
/* delete input segments, probably freeing their segnos */
list_for_each_entry(cseg, &curs->csegs, entry) {
if (!cseg->part_of_move) {
ret = scoutfs_alloc_free(sb, cseg->segno);
BUG_ON(ret);
}
scoutfs_manifest_init_entry(&ment, cseg->level, 0, cseg->seq,
cseg->first, NULL);
ret = scoutfs_manifest_del(sb, &ment);
BUG_ON(ret);
}
/* add output entries */
list_for_each_entry(cseg, results, entry) {
/* XXX moved upper segments won't have read the segment :P */
if (cseg->seg)
scoutfs_seg_init_ment(&ment, cseg->level, cseg->seg);
else
scoutfs_manifest_init_entry(&ment, cseg->level,
cseg->segno, cseg->seq,
cseg->first, cseg->last);
ret = scoutfs_manifest_add(sb, &ment);
BUG_ON(ret);
}
scoutfs_manifest_unlock(sb);
return 0;
}
/*
* The compaction worker tries to make forward progress with compaction
* every time its kicked. It pretends to send a message requesting
* compaction parameters but in reality the net request function there
* is calling directly into the manifest and back into our compaction
* add routines.
*
* We always try to clean up everything on errors.
*/
static void scoutfs_compact_func(struct work_struct *work)
{
struct compact_info *ci = container_of(work, struct compact_info, work);
struct super_block *sb = ci->sb;
struct compact_cursor curs = {{NULL,}};
struct scoutfs_bio_completion comp;
struct compact_seg *cseg;
LIST_HEAD(results);
int ret;
int err;
INIT_LIST_HEAD(&curs.csegs);
scoutfs_bio_init_comp(&comp);
ret = scoutfs_client_get_compaction(sb, (void *)&curs);
/* short circuit no compaction work to do */
if (ret == 0 && list_empty(&curs.csegs))
return;
/* trace compaction ranges */
list_for_each_entry(cseg, &curs.csegs, entry) {
trace_scoutfs_compact_input(sb, cseg->level, cseg->segno,
cseg->seq, cseg->first, cseg->last);
}
if (ret == 0 && !list_empty(&curs.csegs)) {
ret = compact_segments(sb, &curs, &comp, &results);
/* always wait for io completion */
err = scoutfs_bio_wait_comp(sb, &comp);
if (!ret && err)
ret = err;
}
/* don't update manifest on error, just free segnos */
if (ret) {
list_for_each_entry(cseg, &results, entry) {
if (!cseg->part_of_move)
curs.segnos[curs.nr_segnos++] = cseg->segno;
}
free_cseg_list(sb, &curs.csegs);
free_cseg_list(sb, &results);
}
err = scoutfs_client_finish_compaction(sb, &curs, &results);
if (!ret && err)
ret = err;
free_cseg_list(sb, &curs.csegs);
free_cseg_list(sb, &results);
WARN_ON_ONCE(ret);
trace_scoutfs_compact_func(sb, ret);
}
void scoutfs_compact_kick(struct super_block *sb)
{
DECLARE_COMPACT_INFO(sb, ci);
queue_work(ci->workq, &ci->work);
}
int scoutfs_compact_setup(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct compact_info *ci;
ci = kzalloc(sizeof(struct compact_info), GFP_KERNEL);
if (!ci)
return -ENOMEM;
ci->sb = sb;
INIT_WORK(&ci->work, scoutfs_compact_func);
ci->workq = alloc_workqueue("scoutfs_compact", 0, 1);
if (!ci->workq) {
kfree(ci);
return -ENOMEM;
}
sbi->compact_info = ci;
return 0;
}
/*
* The system should be idle, there should not be any more manifest
* modification which would kick compaction.
*/
void scoutfs_compact_destroy(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
DECLARE_COMPACT_INFO(sb, ci);
if (ci) {
flush_work(&ci->work);
destroy_workqueue(ci->workq);
sbi->compact_info = NULL;
}
}