scoutfs: Use separate block devices for metadata and data

Require that the path to a second block device, used for metadata, be given via the metadev_path mount option.

Verify that the super block on the metadata device matches the copy that
is also written to the data device. Change code as needed in super.c to
allow both to be read. Remove the check for overlapping meta and data
blknos, since they are now on entirely separate bdevs.

Use meta_bdev for superblock, quorum, and block.c reads and writes.

Signed-off-by: Andy Grover <agrover@versity.com>
This commit is contained in:
Andy Grover
2020-10-21 10:30:37 -07:00
committed by Zach Brown
parent ff532eba75
commit 9f151fde92
8 changed files with 221 additions and 34 deletions

View File

@@ -386,6 +386,7 @@ static void block_bio_end_io(struct bio *bio, int err)
static int block_submit_bio(struct super_block *sb, struct block_private *bp,
int rw)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct bio *bio = NULL;
struct blk_plug plug;
struct page *page;
@@ -414,7 +415,7 @@ static int block_submit_bio(struct super_block *sb, struct block_private *bp,
}
bio->bi_sector = sector + (off >> 9);
bio->bi_bdev = sb->s_bdev;
bio->bi_bdev = sbi->meta_bdev;
bio->bi_end_io = block_bio_end_io;
bio->bi_private = bp;
@@ -864,7 +865,7 @@ static void sm_block_bio_end_io(struct bio *bio, int err)
* only layer that sees the full block buffer so we pass the calculated
* crc to the caller for them to check in their context.
*/
static int sm_block_io(struct super_block *sb, int rw, u64 blkno,
static int sm_block_io(struct block_device *bdev, int rw, u64 blkno,
struct scoutfs_block_header *hdr, size_t len,
__le32 *blk_crc)
{
@@ -902,7 +903,7 @@ static int sm_block_io(struct super_block *sb, int rw, u64 blkno,
}
bio->bi_sector = blkno << (SCOUTFS_BLOCK_SM_SHIFT - 9);
bio->bi_bdev = sb->s_bdev;
bio->bi_bdev = bdev;
bio->bi_end_io = sm_block_bio_end_io;
bio->bi_private = &sbc;
bio_add_page(bio, page, SCOUTFS_BLOCK_SM_SIZE, 0);
@@ -925,17 +926,19 @@ out:
return ret;
}
int scoutfs_block_read_sm(struct super_block *sb, u64 blkno,
int scoutfs_block_read_sm(struct super_block *sb,
struct block_device *bdev, u64 blkno,
struct scoutfs_block_header *hdr, size_t len,
__le32 *blk_crc)
{
return sm_block_io(sb, READ, blkno, hdr, len, blk_crc);
return sm_block_io(bdev, READ, blkno, hdr, len, blk_crc);
}
int scoutfs_block_write_sm(struct super_block *sb, u64 blkno,
int scoutfs_block_write_sm(struct super_block *sb,
struct block_device *bdev, u64 blkno,
struct scoutfs_block_header *hdr, size_t len)
{
return sm_block_io(sb, WRITE, blkno, hdr, len, NULL);
return sm_block_io(bdev, WRITE, blkno, hdr, len, NULL);
}
int scoutfs_block_setup(struct super_block *sb)

View File

@@ -46,10 +46,12 @@ bool scoutfs_block_writer_has_dirty(struct super_block *sb,
u64 scoutfs_block_writer_dirty_bytes(struct super_block *sb,
struct scoutfs_block_writer *wri);
int scoutfs_block_read_sm(struct super_block *sb, u64 blkno,
int scoutfs_block_read_sm(struct super_block *sb,
struct block_device *bdev, u64 blkno,
struct scoutfs_block_header *hdr, size_t len,
__le32 *blk_crc);
int scoutfs_block_write_sm(struct super_block *sb, u64 blkno,
int scoutfs_block_write_sm(struct super_block *sb,
struct block_device *bdev, u64 blkno,
struct scoutfs_block_header *hdr, size_t len);
int scoutfs_block_setup(struct super_block *sb);

View File

@@ -61,6 +61,12 @@
#define SCOUTFS_QUORUM_BLKNO ((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
#define SCOUTFS_QUORUM_BLOCKS ((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
/*
* Start data on the data device aligned as well.
*/
#define SCOUTFS_DATA_DEV_START_BLKNO ((256ULL * 1024) >> SCOUTFS_BLOCK_SM_SHIFT)
#define SCOUTFS_UNIQUE_NAME_MAX_BYTES 64 /* includes null */
/*
@@ -585,10 +591,13 @@ struct scoutfs_quorum_block {
((SCOUTFS_BLOCK_SM_SIZE - sizeof(struct scoutfs_quorum_block)) / \
sizeof(struct scoutfs_quorum_log))
#define SCOUTFS_FLAG_IS_META_BDEV 0x01
struct scoutfs_super_block {
struct scoutfs_block_header hdr;
__le64 id;
__le64 format_hash;
__le64 flags;
__u8 uuid[SCOUTFS_UUID_BYTES];
__le64 next_ino;
__le64 next_trans_seq;

View File

@@ -16,6 +16,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/inet.h>
@@ -28,6 +29,7 @@
/*
 * Mount option patterns, one entry per Opt_* token.  Both options take a
 * string argument ("%s").  The Opt_err entry with a NULL pattern ends the
 * table.
 */
static const match_table_t tokens = {
{Opt_server_addr, "server_addr=%s"},
{Opt_metadev_path, "metadev_path=%s"},
{Opt_err, NULL}
};
@@ -81,6 +83,52 @@ static int parse_ipv4(struct super_block *sb, char *str,
return 0;
}
/*
 * Duplicate the path given as a block device mount option and verify
 * that it resolves, following symlinks, to a block device inode.  The
 * device itself is not opened here; only the path is checked.
 *
 * On success 0 is returned and *bdev_path_ret is set to the duplicated
 * path string, which the caller owns and must eventually kfree().  Any
 * string already stored in *bdev_path_ret is freed first so that giving
 * the option more than once doesn't leak the earlier copy.
 * NOTE(review): this assumes callers initialize *bdev_path_ret to NULL
 * before parsing -- confirm for any new caller.
 *
 * On failure an error is logged, a negative errno is returned
 * (-ENOMEM, the kern_path() error, or -ENOTBLK) and *bdev_path_ret is
 * left untouched.
 */
static int parse_bdev_path(struct super_block *sb, substring_t *substr,
			   char **bdev_path_ret)
{
	char *bdev_path;
	struct inode *bdev_inode;
	struct path path;
	bool got_path = false;
	int ret;

	bdev_path = match_strdup(substr);
	if (!bdev_path) {
		scoutfs_err(sb, "bdev string dup failed");
		ret = -ENOMEM;
		goto out;
	}

	ret = kern_path(bdev_path, LOOKUP_FOLLOW, &path);
	if (ret) {
		scoutfs_err(sb, "path %s not found for bdev: error %d",
			    bdev_path, ret);
		goto out;
	}
	/* from here on we hold a path reference that must be dropped */
	got_path = true;

	bdev_inode = d_inode(path.dentry);
	if (!S_ISBLK(bdev_inode->i_mode)) {
		scoutfs_err(sb, "path %s for bdev is not a block device",
			    bdev_path);
		ret = -ENOTBLK;
		goto out;
	}

out:
	if (got_path)
		path_put(&path);

	if (ret < 0) {
		/* kfree(NULL) is fine if the dup itself failed */
		kfree(bdev_path);
	} else {
		/* drop the copy from an earlier instance of the option */
		kfree(*bdev_path_ret);
		*bdev_path_ret = bdev_path;
	}

	return ret;
}
int scoutfs_parse_options(struct super_block *sb, char *options,
struct mount_options *parsed)
{
@@ -106,6 +154,13 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
if (ret < 0)
return ret;
break;
case Opt_metadev_path:
ret = parse_bdev_path(sb, &args[0],
&parsed->metadev_path);
if (ret < 0)
return ret;
break;
default:
scoutfs_err(sb, "Unknown or malformed option, \"%s\"",
p);
@@ -113,6 +168,11 @@ int scoutfs_parse_options(struct super_block *sb, char *options,
}
}
if (!parsed->metadev_path) {
scoutfs_err(sb, "Required mount option \"metadev_path\" not found");
return -EINVAL;
}
return 0;
}

View File

@@ -7,11 +7,13 @@
enum {
Opt_server_addr,
Opt_metadev_path,
Opt_err,
};
/* Parsed mount options, filled in by scoutfs_parse_options(). */
struct mount_options {
struct sockaddr_in server_addr;
/*
 * Path to the metadata block device; required.  Heap-allocated string
 * duplicated from the option text, freed with kfree() in put_super.
 */
char *metadev_path;
};
int scoutfs_parse_options(struct super_block *sb, char *options,

View File

@@ -112,12 +112,13 @@ static ktime_t random_to(u32 lo, u32 hi)
/*
* The caller is about to read all the quorum blocks. We invalidate any
* cached blocks and issue one large contiguous read to repopulate the
* cache. The caller then uses normal sb_bread to read each block. I'm
* cache. The caller then uses normal __bread to read each block. I'm
* not a huge fan of the plug but I couldn't get the individual
* readahead requests merged without it.
*/
static void readahead_quorum_blocks(struct super_block *sb)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct buffer_head *bh;
struct blk_plug plug;
int i;
@@ -125,7 +126,8 @@ static void readahead_quorum_blocks(struct super_block *sb)
blk_start_plug(&plug);
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
bh = sb_getblk(sb, SCOUTFS_QUORUM_BLKNO + i);
bh = __getblk(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO + i,
SCOUTFS_BLOCK_SM_SIZE);
if (!bh)
continue;
@@ -215,6 +217,7 @@ static bool stale_quorum_block(struct scoutfs_quorum_block *a,
static int read_quorum_blocks(struct super_block *sb, struct list_head *blocks)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_quorum_block *blk;
struct quorum_block_head *qbh;
struct quorum_block_head *tmp;
@@ -227,7 +230,8 @@ static int read_quorum_blocks(struct super_block *sb, struct list_head *blocks)
for (i = 0; i < SCOUTFS_QUORUM_BLOCKS; i++) {
brelse(bh);
bh = sb_bread(sb, SCOUTFS_QUORUM_BLKNO + i);
bh = __bread(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO + i,
SCOUTFS_BLOCK_SM_SIZE);
if (!bh) {
scoutfs_inc_counter(sb, quorum_read_block_error);
ret = -EIO;
@@ -291,6 +295,7 @@ static int write_quorum_block(struct super_block *sb,
struct scoutfs_quorum_block *our_blk)
{
struct scoutfs_super_block *super = &SCOUTFS_SB(sb)->super;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
struct scoutfs_quorum_block *blk;
struct buffer_head *bh = NULL;
size_t size;
@@ -299,8 +304,9 @@ static int write_quorum_block(struct super_block *sb,
BUILD_BUG_ON(sizeof(struct scoutfs_quorum_block) >
SCOUTFS_BLOCK_SM_SIZE);
bh = sb_getblk(sb, SCOUTFS_QUORUM_BLKNO +
prandom_u32_max(SCOUTFS_QUORUM_BLOCKS));
bh = __getblk(sbi->meta_bdev, SCOUTFS_QUORUM_BLKNO +
prandom_u32_max(SCOUTFS_QUORUM_BLOCKS),
SCOUTFS_BLOCK_SM_SIZE);
if (bh == NULL) {
ret = -EIO;
goto out;

View File

@@ -177,6 +177,7 @@ static int scoutfs_show_options(struct seq_file *seq, struct dentry *root)
struct mount_options *opts = &SCOUTFS_SB(sb)->opts;
seq_printf(seq, ",server_addr="SIN_FMT, SIN_ARG(&opts->server_addr));
seq_printf(seq, ",metadev_path=%s", opts->metadev_path);
return 0;
}
@@ -205,6 +206,20 @@ static int scoutfs_sync_fs(struct super_block *sb, int wait)
return scoutfs_trans_sync(sb, wait);
}
/*
 * The generic code closes the data device for us, but the metadata
 * device is one we opened ourselves so we have to release it
 * explicitly.  Nothing is done if no metadata device is currently
 * recorded in the sb info, so this is safe to call more than once.
 */
static void scoutfs_metadev_close(struct super_block *sb)
{
	struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
	struct block_device *bdev = sbi->meta_bdev;

	if (!bdev)
		return;

	/* put with the same mode that was used to open the device */
	blkdev_put(bdev, SCOUTFS_META_BDEV_MODE);
	sbi->meta_bdev = NULL;
}
/*
* This destroys all the state that's built up in the sb info during
* mount. It's called by us on errors during mount if we haven't set
@@ -247,6 +262,9 @@ static void scoutfs_put_super(struct super_block *sb)
debugfs_remove(sbi->debug_root);
scoutfs_destroy_counters(sb);
scoutfs_destroy_sysfs(sb);
scoutfs_metadev_close(sb);
kfree(sbi->opts.metadev_path);
kfree(sbi);
sb->s_fs_info = NULL;
@@ -271,18 +289,21 @@ static const struct super_operations scoutfs_super_ops = {
int scoutfs_write_super(struct super_block *sb,
struct scoutfs_super_block *super)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
le64_add_cpu(&super->hdr.seq, 1);
return scoutfs_block_write_sm(sb, SCOUTFS_SUPER_BLKNO, &super->hdr,
return scoutfs_block_write_sm(sb, sbi->meta_bdev, SCOUTFS_SUPER_BLKNO,
&super->hdr,
sizeof(struct scoutfs_super_block));
}
/*
* Read the super block. If it's valid store it in the caller's super
* struct.
* Read super, specifying bdev.
*/
int scoutfs_read_super(struct super_block *sb,
struct scoutfs_super_block *super_res)
static int scoutfs_read_super_from_bdev(struct super_block *sb,
struct block_device *bdev,
struct scoutfs_super_block *super_res)
{
struct scoutfs_super_block *super;
__le32 calc;
@@ -293,9 +314,8 @@ int scoutfs_read_super(struct super_block *sb,
if (!super)
return -ENOMEM;
ret = scoutfs_block_read_sm(sb, SCOUTFS_SUPER_BLKNO, &super->hdr,
sizeof(struct scoutfs_super_block),
&calc);
ret = scoutfs_block_read_sm(sb, bdev, SCOUTFS_SUPER_BLKNO, &super->hdr,
sizeof(struct scoutfs_super_block), &calc);
if (ret < 0)
goto out;
@@ -357,15 +377,6 @@ int scoutfs_read_super(struct super_block *sb,
goto out;
}
blkno = (le64_to_cpu(super->last_meta_blkno) + 1) <<
SCOUTFS_BLOCK_SM_LG_SHIFT;
if (le64_to_cpu(super->first_data_blkno) < blkno) {
scoutfs_err(sb, "super block first data blkno %llu is within last meta blkno %llu",
le64_to_cpu(super->first_data_blkno), blkno);
ret = -EINVAL;
goto out;
}
if (le64_to_cpu(super->first_data_blkno) >
le64_to_cpu(super->last_data_blkno)) {
scoutfs_err(sb, "super block first data blkno %llu is greater than last data blkno %llu",
@@ -384,13 +395,25 @@ int scoutfs_read_super(struct super_block *sb,
goto out;
}
*super_res = *super;
ret = 0;
out:
if (ret == 0)
*super_res = *super;
kfree(super);
return ret;
}
/*
* Read the super block from meta dev.
*/
int scoutfs_read_super(struct super_block *sb,
struct scoutfs_super_block *super_res)
{
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
return scoutfs_read_super_from_bdev(sb, sbi->meta_bdev, super_res);
}
/*
* This needs to be setup after reading the super because it uses the
* fsid found in the super block.
@@ -427,10 +450,66 @@ static int assign_random_id(struct scoutfs_sb_info *sbi)
return 0;
}
/*
* Ensure superblock copies in metadata and data block devices are valid, and
* fill in in-memory superblock if so.
*/
static int scoutfs_read_supers(struct super_block *sb)
{
struct scoutfs_super_block *meta_super = NULL;
struct scoutfs_super_block *data_super = NULL;
struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb);
int ret = 0;
meta_super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
data_super = kmalloc(sizeof(struct scoutfs_super_block), GFP_NOFS);
if (!meta_super || !data_super) {
ret = -ENOMEM;
goto out;
}
ret = scoutfs_read_super_from_bdev(sb, sbi->meta_bdev, meta_super);
if (ret < 0) {
scoutfs_err(sb, "could not get meta_super: error %d", ret);
goto out;
}
ret = scoutfs_read_super_from_bdev(sb, sb->s_bdev, data_super);
if (ret < 0) {
scoutfs_err(sb, "could not get data_super: error %d", ret);
goto out;
}
if (!SCOUTFS_IS_META_BDEV(meta_super)) {
scoutfs_err(sb, "meta_super META flag not set");
ret = -EINVAL;
goto out;
}
if (SCOUTFS_IS_META_BDEV(data_super)) {
scoutfs_err(sb, "data_super META flag set");
ret = -EINVAL;
goto out;
}
if (memcmp(meta_super->uuid, data_super->uuid, SCOUTFS_UUID_BYTES)) {
scoutfs_err(sb, "superblock UUID mismatch");
ret = -EINVAL;
goto out;
}
sbi->super = *meta_super;
out:
kfree(meta_super);
kfree(data_super);
return ret;
}
static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct scoutfs_sb_info *sbi;
struct mount_options opts;
struct block_device *meta_bdev;
struct inode *inode;
int ret;
@@ -476,7 +555,24 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
}
ret = scoutfs_read_super(sb, &SCOUTFS_SB(sb)->super) ?:
meta_bdev =
blkdev_get_by_path(sbi->opts.metadev_path,
SCOUTFS_META_BDEV_MODE, sb);
if (IS_ERR(meta_bdev)) {
scoutfs_err(sb, "could not open metadev: error %ld",
PTR_ERR(meta_bdev));
ret = PTR_ERR(meta_bdev);
goto out;
}
sbi->meta_bdev = meta_bdev;
ret = set_blocksize(sbi->meta_bdev, SCOUTFS_BLOCK_SM_SIZE);
if (ret != 0) {
scoutfs_err(sb, "failed to set metadev blocksize, returned %d",
ret);
goto out;
}
ret = scoutfs_read_supers(sb) ?:
scoutfs_debugfs_setup(sb) ?:
scoutfs_setup_sysfs(sb) ?:
scoutfs_setup_counters(sb) ?:

View File

@@ -36,6 +36,8 @@ struct scoutfs_sb_info {
struct scoutfs_super_block super;
struct block_device *meta_bdev;
spinlock_t next_ino_lock;
struct data_info *data_info;
@@ -94,6 +96,13 @@ static inline bool SCOUTFS_HAS_SBI(struct super_block *sb)
return (sb != NULL) && (SCOUTFS_SB(sb) != NULL);
}
static inline bool SCOUTFS_IS_META_BDEV(struct scoutfs_super_block *super_block)
{
return !!(super_block->flags & SCOUTFS_FLAG_IS_META_BDEV);
}
/*
 * Mode used to open the metadata device: read/write with exclusive
 * access.  The same mode is passed to blkdev_put() when it is released.
 */
#define SCOUTFS_META_BDEV_MODE (FMODE_READ | FMODE_WRITE | FMODE_EXCL)
/*
* A small string embedded in messages that's used to identify a
* specific mount. It's the three most significant bytes of the fsid