scoutfs: add btree stored in persistent ring
Add a cow btree whose blocks are stored in a persistently allocated ring. This will let us incrementally index very large data sets efficiently. This is an adaptation of the previous btree code which now uses the ring, stores variable-length keys, and augments the items with bits that are ORed up through parents.

Signed-off-by: Zach Brown <zab@versity.com>
@@ -2,6 +2,7 @@ obj-$(CONFIG_SCOUTFS_FS) := scoutfs.o

CFLAGS_scoutfs_trace.o = -I$(src) # define_trace.h double include

-scoutfs-y += alloc.o bio.o compact.o counters.o data.o dir.o kvec.o inode.o \
-	      ioctl.o item.o key.o lock.o manifest.o msg.o net.o options.o \
-	      ring.o seg.o scoutfs_trace.o super.o trans.o xattr.o
+scoutfs-y += alloc.o bio.o btree.o compact.o counters.o data.o dir.o kvec.o \
+	      inode.o ioctl.o item.o key.o lock.o manifest.o msg.o net.o \
+	      options.o ring.o seg.o scoutfs_trace.o sort_priv.o super.o trans.o \
+	      xattr.o
1878  kmod/src/btree.c  (new file; diff suppressed because it is too large)
51  kmod/src/btree.h  (new file)
@@ -0,0 +1,51 @@
#ifndef _SCOUTFS_BTREE_H_
#define _SCOUTFS_BTREE_H_

#include <linux/uio.h>

struct scoutfs_btree_item_ref {
	void *key;
	unsigned key_len;
	void *val;
	unsigned val_len;
};

#define SCOUTFS_BTREE_ITEM_REF(name) \
	struct scoutfs_btree_item_ref name = {NULL,}

int scoutfs_btree_lookup(struct super_block *sb, struct scoutfs_btree_root *root,
			 void *key, unsigned key_len,
			 struct scoutfs_btree_item_ref *iref);
int scoutfs_btree_insert(struct super_block *sb, struct scoutfs_btree_root *root,
			 void *key, unsigned key_len,
			 void *val, unsigned val_len);
int scoutfs_btree_update(struct super_block *sb, struct scoutfs_btree_root *root,
			 void *key, unsigned key_len,
			 void *val, unsigned val_len);
int scoutfs_btree_delete(struct super_block *sb, struct scoutfs_btree_root *root,
			 void *key, unsigned key_len);
int scoutfs_btree_next(struct super_block *sb, struct scoutfs_btree_root *root,
		       void *key, unsigned key_len,
		       struct scoutfs_btree_item_ref *iref);
int scoutfs_btree_after(struct super_block *sb, struct scoutfs_btree_root *root,
			void *key, unsigned key_len,
			struct scoutfs_btree_item_ref *iref);
int scoutfs_btree_prev(struct super_block *sb, struct scoutfs_btree_root *root,
		       void *key, unsigned key_len,
		       struct scoutfs_btree_item_ref *iref);
int scoutfs_btree_before(struct super_block *sb, struct scoutfs_btree_root *root,
			 void *key, unsigned key_len,
			 struct scoutfs_btree_item_ref *iref);
int scoutfs_btree_dirty(struct super_block *sb, struct scoutfs_btree_root *root,
			void *key, unsigned key_len);

void scoutfs_btree_put_iref(struct scoutfs_btree_item_ref *iref);

bool scoutfs_btree_has_dirty(struct super_block *sb);
int scoutfs_btree_write_dirty(struct super_block *sb);
void scoutfs_btree_write_complete(struct super_block *sb);

int scoutfs_btree_setup(struct super_block *sb);
void scoutfs_btree_destroy(struct super_block *sb);

#endif
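A minimal sketch, not part of the commit, of how a caller might drive this ref-based interface: lookups hand back pointers to the key and value inside the btree block through an item ref, which is dropped with scoutfs_btree_put_iref(). The 0/-errno return convention, struct my_key, and the value copy below are assumptions for illustration only.

/* Hypothetical caller: look up one item and copy out its value. */
static int example_lookup(struct super_block *sb,
			  struct scoutfs_btree_root *root,
			  struct my_key *key, void *buf, unsigned buf_len)
{
	SCOUTFS_BTREE_ITEM_REF(iref);
	int ret;

	ret = scoutfs_btree_lookup(sb, root, key, sizeof(*key), &iref);
	if (ret)
		return ret;

	/* iref.key/iref.val stay valid only while the ref is held */
	memcpy(buf, iref.val, min(buf_len, iref.val_len));
	scoutfs_btree_put_iref(&iref);

	return 0;
}

The _next/_prev and _after/_before variants presumably return neighboring items the same way, each handing back an iref that the caller must put.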
@@ -76,6 +76,90 @@ struct scoutfs_ring_descriptor {
	__le64 nr_blocks;
} __packed;

/*
 * Assert that we'll be able to represent all possible keys with 8 64bit
 * primary sort values.
 */
#define SCOUTFS_BTREE_GREATEST_KEY_LEN 32
/* level >0 segments can have a full key and some metadata */
#define SCOUTFS_BTREE_MAX_KEY_LEN 320
/* level 0 segments can have two full keys in the value :/ */
#define SCOUTFS_BTREE_MAX_VAL_LEN 768

/*
 * A 4EB test image measured a worst case height of 17.  This is plenty
 * generous.
 */
#define SCOUTFS_BTREE_MAX_HEIGHT 20

/* btree blocks (beyond the first) need to be at least half full */
#define SCOUTFS_BTREE_FREE_LIMIT \
	((SCOUTFS_BLOCK_SIZE - sizeof(struct scoutfs_btree_block)) / 2)

#define SCOUTFS_BTREE_BITS 8

/*
 * Btree items can have bits associated with them.  Their parent items
 * reflect all the bits that their child block contains.  Thus searches
 * can find items with bits set.
 *
 * @SCOUTFS_BTREE_BIT_HALF1: Tracks blocks found in the first half of
 * the ring.  It's used to migrate blocks from the old half of the ring
 * into the current half as blocks are dirtied.  It's not found in leaf
 * items but is calculated based on the block number of referenced
 * blocks.  _HALF2 is identical but for the second half of the ring.
 */
enum {
	SCOUTFS_BTREE_BIT_HALF1 = (1 << 0),
	SCOUTFS_BTREE_BIT_HALF2 = (1 << 1),
};

struct scoutfs_btree_ref {
	__le64 blkno;
	__le64 seq;
} __packed;

/*
 * A height of X means that the first block read will have level X-1 and
 * the leaves will have level 0.
 */
struct scoutfs_btree_root {
	struct scoutfs_btree_ref ref;
	__u8 height;
} __packed;

struct scoutfs_btree_item_header {
	__le16 off;
	__u8 bits;
} __packed;

struct scoutfs_btree_item {
	__le16 key_len;
	__le16 val_len;
	__u8 data[0];
} __packed;

struct scoutfs_btree_block {
	__le64 fsid;
	__le64 blkno;
	__le64 seq;
	__le32 crc;
	__le32 _pad;
	__le16 free_end;
	__le16 free_reclaim;
	__le16 nr_items;
	__le16 bit_counts[SCOUTFS_BTREE_BITS];
	__u8 level;
	struct scoutfs_btree_item_header item_hdrs[0];
} __packed;

struct scoutfs_btree_ring {
	__le64 first_blkno;
	__le64 nr_blocks;
	__le64 next_block;
	__le64 next_seq;
} __packed;

/*
 * This is absurdly huge.  If there was only ever 1 item per segment and
 * 2^64 items the tree could get this deep.
@@ -313,6 +397,7 @@ struct scoutfs_super_block {
	__le64 ring_blocks;
	__le64 ring_tail_block;
	__le64 ring_gen;
	struct scoutfs_btree_ring bring;
	__le64 next_seg_seq;
	struct scoutfs_ring_descriptor alloc_ring;
	struct scoutfs_manifest manifest;
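The _HALF1/_HALF2 comment above says the bit for a parent's reference is calculated from the block number of the referenced block rather than stored in leaf items. A rough sketch of that calculation follows; the helper name and the exact midpoint arithmetic are assumptions, since btree.c's diff is suppressed above.

/*
 * Illustrative only: derive the HALF1/HALF2 bit for a block reference
 * from its position within the ring, as the format comment describes.
 * The split at nr_blocks / 2 is an assumption about the "halves".
 */
static u8 btree_ref_half_bit(struct scoutfs_btree_ring *bring,
			     struct scoutfs_btree_ref *ref)
{
	u64 first = le64_to_cpu(bring->first_blkno);
	u64 half = le64_to_cpu(bring->nr_blocks) / 2;

	if (le64_to_cpu(ref->blkno) < first + half)
		return SCOUTFS_BTREE_BIT_HALF1;

	return SCOUTFS_BTREE_BIT_HALF2;
}

With the per-bit bit_counts totals in each parent block, a walker can then cheaply find subtrees that still reference blocks in the half of the ring about to be reclaimed and dirty them, which is the migration the comment alludes to.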
71  kmod/src/sort_priv.c  (new file)
@@ -0,0 +1,71 @@
/*
 * A copy of sort() from upstream with a priv argument that's passed
 * to comparison, like list_sort().
 */

/* ------------------------ */

/*
 * A fast, small, non-recursive O(n log n) sort for the Linux kernel
 *
 * Jan 23 2005  Matt Mackall <mpm@selenic.com>
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sort.h>
#include <linux/slab.h>
#include "sort_priv.h"

/**
 * sort_priv - sort an array of elements
 * @priv: caller's pointer to pass to comparison and swap functions
 * @base: pointer to data to sort
 * @num: number of elements
 * @size: size of each element
 * @cmp_func: pointer to comparison function
 * @swap_func: pointer to swap function (this copy has no generic fallback,
 *             so it must be provided)
 *
 * This function does a heapsort on the given array.  You may provide a
 * swap_func function optimized to your element type.
 *
 * Sorting time is O(n log n) both on average and worst-case.  While
 * qsort is about 20% faster on average, it suffers from exploitable
 * O(n*n) worst-case behavior and extra memory requirements that make
 * it less suitable for kernel use.
 */
void sort_priv(void *priv, void *base, size_t num, size_t size,
	       int (*cmp_func)(void *priv, const void *, const void *),
	       void (*swap_func)(void *priv, void *, void *, int size))
{
	/* pre-scale counters for performance */
	int i = (num/2 - 1) * size, n = num * size, c, r;

	/* heapify */
	for ( ; i >= 0; i -= size) {
		for (r = i; r * 2 + size < n; r = c) {
			c = r * 2 + size;
			if (c < n - size &&
			    cmp_func(priv, base + c, base + c + size) < 0)
				c += size;
			if (cmp_func(priv, base + r, base + c) >= 0)
				break;
			swap_func(priv, base + r, base + c, size);
		}
	}

	/* sort */
	for (i = n - size; i > 0; i -= size) {
		swap_func(priv, base, base + i, size);
		for (r = 0; r * 2 + size < i; r = c) {
			c = r * 2 + size;
			if (c < i - size &&
			    cmp_func(priv, base + c, base + c + size) < 0)
				c += size;
			if (cmp_func(priv, base + r, base + c) >= 0)
				break;
			swap_func(priv, base + r, base + c, size);
		}
	}
}
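A hypothetical caller, only to show what the priv argument buys: the comparison below needs a shared buffer as context, which the stock kernel sort() cannot pass without a global. The offset-sorting scheme is illustrative and not taken from btree.c.

/* Sort an array of u16 offsets by the bytes they point at in a buffer. */
struct off_sort_ctx {
	char *base;		/* buffer the offsets index into */
	unsigned len;		/* bytes to compare per entry */
};

static int cmp_offs(void *priv, const void *a, const void *b)
{
	struct off_sort_ctx *ctx = priv;

	return memcmp(ctx->base + *(const u16 *)a,
		      ctx->base + *(const u16 *)b, ctx->len);
}

static void swap_offs(void *priv, void *a, void *b, int size)
{
	u16 tmp = *(u16 *)a;

	*(u16 *)a = *(u16 *)b;
	*(u16 *)b = tmp;
}

/* usage: sort_priv(&ctx, offs, nr, sizeof(offs[0]), cmp_offs, swap_offs); */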
8  kmod/src/sort_priv.h  (new file)
@@ -0,0 +1,8 @@
#ifndef _SCOUTFS_SORT_PRIV_H_
#define _SCOUTFS_SORT_PRIV_H_

void sort_priv(void *priv, void *base, size_t num, size_t size,
	       int (*cmp_func)(void *priv, const void *, const void *),
	       void (*swap_func)(void *priv, void *, void *, int size));

#endif
@@ -203,6 +203,9 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_op = &scoutfs_super_ops;

	/* btree blocks use long lived bh->b_data refs */
	mapping_set_gfp_mask(sb->s_bdev->bd_inode->i_mapping, GFP_NOFS);

	sbi = kzalloc(sizeof(struct scoutfs_sb_info), GFP_KERNEL);
	sb->s_fs_info = sbi;
	sbi->sb = sb;
@@ -17,6 +17,7 @@ struct trans_info;
struct lock_info;
struct net_info;
struct inode_sb_info;
struct btree_info;

struct scoutfs_sb_info {
	struct super_block *sb;
@@ -34,6 +35,7 @@ struct scoutfs_sb_info {
	struct compact_info *compact_info;
	struct data_info *data_info;
	struct inode_sb_info *inode_sb_info;
	struct btree_info *btree_info;

	wait_queue_head_t trans_hold_wq;
	struct task_struct *trans_task;