From 5f11cdbfe53da21d23c7aa7974537f8b07165eea Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 19 May 2017 13:51:00 -0700 Subject: [PATCH] scoutfs: add and index inode meta and data seqs For each transaction we send a message to the server asking for a unique sequence number to associate with the transaction. When we change metadata or data of an inode we store the current transaction seq in the inode and we index it with index items like the other inode fields. The server remembers the sequences it gives out. When we go to walk the inode sequence indexes we ask the server for the largest stable seq and limit results to that seq. This ensures that we never return seqs that are past dirty items so we never have inodes and seqs appear in the past. Nodes use the sync timer to regularly cycle through seqs and ensure that inode seq index walks don't get stuck on their otherwise idle seq. Signed-off-by: Zach Brown --- kmod/src/data.c | 6 +- kmod/src/format.h | 15 ++++ kmod/src/inode.c | 88 ++++++++++++++++-- kmod/src/inode.h | 12 ++- kmod/src/ioctl.c | 46 +++++----- kmod/src/ioctl.h | 15 ++++ kmod/src/net.c | 225 ++++++++++++++++++++++++++++++++++++++++++++++ kmod/src/net.h | 2 + kmod/src/super.c | 4 + kmod/src/super.h | 2 + kmod/src/trans.c | 13 +++ 11 files changed, 396 insertions(+), 32 deletions(-) diff --git a/kmod/src/data.c b/kmod/src/data.c index 98dc2143..76cdd2a5 100644 --- a/kmod/src/data.c +++ b/kmod/src/data.c @@ -1062,6 +1062,7 @@ static int scoutfs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; int ret; @@ -1070,7 +1071,10 @@ static int scoutfs_write_end(struct file *file, struct address_space *mapping, ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); if (ret > 0) { - scoutfs_inode_inc_data_version(inode); + if (!si->staging) { 
scoutfs_inode_set_data_seq(inode); + scoutfs_inode_inc_data_version(inode); + } /* XXX kind of a big hammer, inode life cycle needs work */ scoutfs_update_inode_item(inode); scoutfs_inode_queue_writeback(inode); diff --git a/kmod/src/format.h b/kmod/src/format.h index 4afd26f6..dd991c2d 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -163,6 +163,8 @@ struct scoutfs_segment_block { #define SCOUTFS_INODE_INDEX_CTIME_KEY 13 #define SCOUTFS_INODE_INDEX_MTIME_KEY 14 #define SCOUTFS_INODE_INDEX_SIZE_KEY 15 +#define SCOUTFS_INODE_INDEX_META_SEQ_KEY 16 +#define SCOUTFS_INODE_INDEX_DATA_SEQ_KEY 17 /* not found in the fs */ #define SCOUTFS_MAX_UNUSED_KEY 253 #define SCOUTFS_NET_ADDR_KEY 254 @@ -280,6 +282,7 @@ struct scoutfs_super_block { __le64 id; __u8 uuid[SCOUTFS_UUID_BYTES]; __le64 next_ino; + __le64 next_seq; __le64 alloc_uninit; __le64 total_segs; __le64 free_segs; @@ -300,6 +303,14 @@ struct scoutfs_timespec { } __packed; /* + * @meta_seq: advanced the first time an inode is updated in a given + * transaction. It can only advance again after the inode is written + * and a new transaction opens. + * + * @data_seq: advanced the first time a file's data (or size) is + * modified in a given transaction. It can only advance again after the + * file is written and a new transaction opens. + * * @data_version: incremented every time the contents of a file could * have changed. It is exposed via an ioctl and is then provided as an * argument to data functions to protect racing modification. 
@@ -314,6 +325,8 @@ struct scoutfs_timespec { struct scoutfs_inode { __le64 size; __le64 blocks; + __le64 meta_seq; + __le64 data_seq; __le64 data_version; __le64 next_readdir_pos; __le32 nlink; @@ -431,6 +444,8 @@ enum { SCOUTFS_NET_ALLOC_SEGNO, SCOUTFS_NET_RECORD_SEGMENT, SCOUTFS_NET_BULK_ALLOC, + SCOUTFS_NET_ADVANCE_SEQ, + SCOUTFS_NET_GET_LAST_SEQ, SCOUTFS_NET_UNKNOWN, }; diff --git a/kmod/src/inode.c b/kmod/src/inode.c index 43c6c730..c1a65eef 100644 --- a/kmod/src/inode.c +++ b/kmod/src/inode.c @@ -192,6 +192,8 @@ static void set_item_info(struct inode *inode) si->item_size = i_size_read(inode); si->item_ctime = inode->i_ctime; si->item_mtime = inode->i_mtime; + si->item_meta_seq = scoutfs_inode_meta_seq(inode); + si->item_data_seq = scoutfs_inode_data_seq(inode); } static void load_inode(struct inode *inode, struct scoutfs_inode *cinode) @@ -211,6 +213,8 @@ static void load_inode(struct inode *inode, struct scoutfs_inode *cinode) inode->i_ctime.tv_sec = le64_to_cpu(cinode->ctime.sec); inode->i_ctime.tv_nsec = le32_to_cpu(cinode->ctime.nsec); + ci->meta_seq = le64_to_cpu(cinode->meta_seq); + ci->data_seq = le64_to_cpu(cinode->data_seq); ci->data_version = le64_to_cpu(cinode->data_version); ci->next_readdir_pos = le64_to_cpu(cinode->next_readdir_pos); @@ -245,31 +249,84 @@ static int scoutfs_read_locked_inode(struct inode *inode) return ret; } -void scoutfs_inode_inc_data_version(struct inode *inode) +/* + * Set a given seq to the current trans seq if it differs. The caller + * holds locks and a transaction which prevents the transaction from + * committing and refreshing the seq. 
+ */ +static void set_trans_seq(struct inode *inode, u64 *seq) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); + struct super_block *sb = inode->i_sb; + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); - if (!si->staging) { + if (*seq != sbi->trans_seq) { preempt_disable(); write_seqcount_begin(&si->seqcount); - si->data_version++; + *seq = sbi->trans_seq; write_seqcount_end(&si->seqcount); preempt_enable(); } } -u64 scoutfs_inode_get_data_version(struct inode *inode) +void scoutfs_inode_set_meta_seq(struct inode *inode) +{ + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + + set_trans_seq(inode, &si->meta_seq); +} + +void scoutfs_inode_set_data_seq(struct inode *inode) +{ + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + + set_trans_seq(inode, &si->data_seq); +} + +void scoutfs_inode_inc_data_version(struct inode *inode) +{ + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + + preempt_disable(); + write_seqcount_begin(&si->seqcount); + si->data_version++; + write_seqcount_end(&si->seqcount); + preempt_enable(); +} + +static u64 read_seqcount_u64(struct inode *inode, u64 *val) { struct scoutfs_inode_info *si = SCOUTFS_I(inode); unsigned int seq; - u64 vers; + u64 v; do { seq = read_seqcount_begin(&si->seqcount); - vers = si->data_version; + v = *val; } while (read_seqcount_retry(&si->seqcount, seq)); - return vers; + return v; +} + +u64 scoutfs_inode_meta_seq(struct inode *inode) +{ + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + + return read_seqcount_u64(inode, &si->meta_seq); +} + +u64 scoutfs_inode_data_seq(struct inode *inode) +{ + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + + return read_seqcount_u64(inode, &si->data_seq); +} + +u64 scoutfs_inode_data_version(struct inode *inode) +{ + struct scoutfs_inode_info *si = SCOUTFS_I(inode); + + return read_seqcount_u64(inode, &si->data_version); } static int scoutfs_iget_test(struct inode *inode, void *arg) @@ -332,7 +389,9 @@ static void store_inode(struct scoutfs_inode *cinode, struct inode 
*inode) cinode->mtime.sec = cpu_to_le64(inode->i_mtime.tv_sec); cinode->mtime.nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - cinode->data_version = cpu_to_le64(ci->data_version); + cinode->meta_seq = cpu_to_le64(scoutfs_inode_meta_seq(inode)); + cinode->data_seq = cpu_to_le64(scoutfs_inode_data_seq(inode)); + cinode->data_version = cpu_to_le64(scoutfs_inode_data_version(inode)); cinode->next_readdir_pos = cpu_to_le64(ci->next_readdir_pos); } @@ -465,6 +524,9 @@ void scoutfs_update_inode_item(struct inode *inode) int ret; int err; + /* set the meta version once per trans for any inode updates */ + scoutfs_inode_set_meta_seq(inode); + ret = update_index(inode, SCOUTFS_INODE_INDEX_CTIME_KEY, inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, si->item_ctime.tv_sec, si->item_ctime.tv_nsec) ?: @@ -472,7 +534,13 @@ void scoutfs_update_inode_item(struct inode *inode) inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, si->item_mtime.tv_sec, si->item_mtime.tv_nsec) ?: update_index(inode, SCOUTFS_INODE_INDEX_SIZE_KEY, - i_size_read(inode), 0, si->item_size, 0); + i_size_read(inode), 0, si->item_size, 0) ?: + update_index(inode, SCOUTFS_INODE_INDEX_META_SEQ_KEY, + scoutfs_inode_meta_seq(inode), 0, + si->item_meta_seq, 0) ?: + update_index(inode, SCOUTFS_INODE_INDEX_DATA_SEQ_KEY, + scoutfs_inode_data_seq(inode), 0, + si->item_data_seq, 0); BUG_ON(ret); store_inode(&sinode, inode); @@ -656,6 +724,8 @@ struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, ci = SCOUTFS_I(inode); ci->ino = ino; + ci->meta_seq = 0; + ci->data_seq = 0; ci->data_version = 0; ci->next_readdir_pos = SCOUTFS_DIRENT_FIRST_POS; ci->have_item = false; diff --git a/kmod/src/inode.h b/kmod/src/inode.h index 5f453996..d95139c0 100644 --- a/kmod/src/inode.h +++ b/kmod/src/inode.h @@ -6,12 +6,16 @@ struct scoutfs_inode_info { /* read or initialized for each inode instance */ u64 ino; - u64 data_version; u64 next_readdir_pos; + u64 meta_seq; + u64 data_seq; + u64 data_version; bool have_item; u64 
item_size; struct timespec item_ctime; struct timespec item_mtime; + u64 item_meta_seq; + u64 item_data_seq; /* initialized once for slab object */ seqcount_t seqcount; @@ -48,8 +52,12 @@ void scoutfs_update_inode_item(struct inode *inode); void scoutfs_inode_fill_pool(struct super_block *sb, u64 ino, u64 nr); struct inode *scoutfs_new_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t rdev); +void scoutfs_inode_set_meta_seq(struct inode *inode); +void scoutfs_inode_set_data_seq(struct inode *inode); void scoutfs_inode_inc_data_version(struct inode *inode); -u64 scoutfs_inode_get_data_version(struct inode *inode); +u64 scoutfs_inode_meta_seq(struct inode *inode); +u64 scoutfs_inode_data_seq(struct inode *inode); +u64 scoutfs_inode_data_version(struct inode *inode); int scoutfs_scan_orphans(struct super_block *sb); diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index 3e0807cc..e5109853 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -30,6 +30,7 @@ #include "trans.h" #include "item.h" #include "data.h" +#include "net.h" /* * Walk one of the inode index items. 
This is a thin ioctl wrapper @@ -45,6 +46,7 @@ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg) struct scoutfs_inode_index_key ikey; struct scoutfs_key_buf last_key; struct scoutfs_key_buf key; + u64 last_seq; int ret = 0; u32 nr; @@ -62,9 +64,28 @@ static long scoutfs_ioc_walk_inodes(struct file *file, unsigned long arg) ikey.type = SCOUTFS_INODE_INDEX_MTIME_KEY; else if (walk.index == SCOUTFS_IOC_WALK_INODES_SIZE) ikey.type = SCOUTFS_INODE_INDEX_SIZE_KEY; + else if (walk.index == SCOUTFS_IOC_WALK_INODES_META_SEQ) + ikey.type = SCOUTFS_INODE_INDEX_META_SEQ_KEY; + else if (walk.index == SCOUTFS_IOC_WALK_INODES_DATA_SEQ) + ikey.type = SCOUTFS_INODE_INDEX_DATA_SEQ_KEY; else return -EINVAL; + /* clamp results to the inodes in the farthest stable seq */ + if (ikey.type == SCOUTFS_INODE_INDEX_META_SEQ_KEY || + ikey.type == SCOUTFS_INODE_INDEX_DATA_SEQ_KEY) { + + ret = scoutfs_net_get_last_seq(sb, &last_seq); + if (ret) + return ret; + + if (last_seq < walk.last.major) { + walk.last.major = last_seq; + walk.last.minor = ~0; + walk.last.ino = ~0ULL; + } + } + ikey.major = cpu_to_be64(walk.first.major); ikey.minor = cpu_to_be32(walk.first.minor); ikey.ino = cpu_to_be64(walk.first.ino); @@ -218,21 +239,6 @@ out: return ret; } -/* - * Sample the inode's data_version. It is not strictly serialized with - * writes that are in flight. - */ -static long scoutfs_ioc_data_version(struct file *file, unsigned long arg) -{ - u64 __user *uvers = (void __user *)arg; - u64 vers = scoutfs_inode_get_data_version(file_inode(file)); - - if (put_user(vers, uvers)) - return -EFAULT; - - return 0; -} - /* * The caller has a version of the data available in the given byte * range in an external archive. 
As long as the data version still @@ -291,7 +297,7 @@ static long scoutfs_ioc_release(struct file *file, unsigned long arg) goto out; } - if (scoutfs_inode_get_data_version(inode) != args.data_version) { + if (scoutfs_inode_data_version(inode) != args.data_version) { ret = -ESTALE; goto out; } @@ -386,7 +392,7 @@ static long scoutfs_ioc_stage(struct file *file, unsigned long arg) goto out; } - if (scoutfs_inode_get_data_version(inode) != args.data_version) { + if (scoutfs_inode_data_version(inode) != args.data_version) { ret = -ESTALE; goto out; } @@ -423,7 +429,9 @@ static long scoutfs_ioc_stat_more(struct file *file, unsigned long arg) stm.valid_bytes = min_t(u64, stm.valid_bytes, sizeof(struct scoutfs_ioctl_stat_more)); - stm.data_version = scoutfs_inode_get_data_version(inode); + stm.meta_seq = scoutfs_inode_meta_seq(inode); + stm.data_seq = scoutfs_inode_data_seq(inode); + stm.data_version = scoutfs_inode_data_version(inode); if (copy_to_user((void __user *)arg, &stm, stm.valid_bytes)) return -EFAULT; @@ -438,8 +446,6 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return scoutfs_ioc_walk_inodes(file, arg); case SCOUTFS_IOC_INO_PATH: return scoutfs_ioc_ino_path(file, arg); - case SCOUTFS_IOC_DATA_VERSION: - return scoutfs_ioc_data_version(file, arg); case SCOUTFS_IOC_RELEASE: return scoutfs_ioc_release(file, arg); case SCOUTFS_IOC_STAGE: diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h index a32a6724..d1814b8c 100644 --- a/kmod/src/ioctl.h +++ b/kmod/src/ioctl.h @@ -31,7 +31,18 @@ struct scoutfs_ioctl_walk_inodes_entry { * minor < major) as each increasingly significant value wraps around to * 0. * + * These indexes are not strictly consistent. The items that back these + * index entries aren't updated with cluster locks so they're not + * guaranteed to be visible the moment you read after writing. They're + * only visible when the transaction that updated them is synced. 
+ * + * In addition, the seq indexes will only allow walking through sequence + * space that has been consistent. This prevents old dirty entries from + * becoming visible after newer stable entries are displayed. + * * If first is greater than last then the walk will return 0 entries. + * + * XXX invalidate before reading. */ struct scoutfs_ioctl_walk_inodes { struct scoutfs_ioctl_walk_inodes_entry first; @@ -45,6 +56,8 @@ enum { SCOUTFS_IOC_WALK_INODES_CTIME = 0, SCOUTFS_IOC_WALK_INODES_MTIME, SCOUTFS_IOC_WALK_INODES_SIZE, + SCOUTFS_IOC_WALK_INODES_META_SEQ, + SCOUTFS_IOC_WALK_INODES_DATA_SEQ, SCOUTFS_IOC_WALK_INODES_UNKNOWN, }; @@ -145,6 +158,8 @@ struct scoutfs_ioctl_stage { */ struct scoutfs_ioctl_stat_more { __u64 valid_bytes; + __u64 meta_seq; + __u64 data_seq; __u64 data_version; } __packed; diff --git a/kmod/src/net.c b/kmod/src/net.c index eb34d12b..1d2662c2 100644 --- a/kmod/src/net.c +++ b/kmod/src/net.c @@ -82,6 +82,10 @@ struct net_info { struct llist_head ring_commit_waiters; struct work_struct ring_commit_work; + /* server tracks seq use */ + spinlock_t seq_lock; + struct list_head pending_seqs; + /* both track active sockets for destruction */ struct list_head active_socks; @@ -628,6 +632,132 @@ static struct send_buf *process_alloc_inodes(struct super_block *sb, return sbuf; } +struct pending_seq { + struct list_head head; + u64 seq; +}; + +/* + * Give the client the next seq for it to use in items in its + * transaction. They tell us the seq they just used so we can remove it + * from pending tracking and possibly include it in get_last_seq + * replies. + * + * The list walk is O(clients) and the message processing rate goes from + * every committed segment to every sync deadline interval. + * + * XXX The pending seq tracking should be persistent so that it survives + * server failover. 
+ */ +static struct send_buf *process_advance_seq(struct super_block *sb, + void *req, int req_len) +{ + DECLARE_NET_INFO(sb, nti); + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + struct pending_seq *next_ps; + struct pending_seq *ps; + struct commit_waiter cw; + __le64 * __packed prev; + __le64 * __packed next; + struct send_buf *sbuf; + int ret; + + if (req_len != sizeof(__le64)) + return ERR_PTR(-EINVAL); + + prev = req; + + sbuf = alloc_sbuf(sizeof(__le64)); + if (!sbuf) + return ERR_PTR(-ENOMEM); + + next = (void *)sbuf->nh->data; + + next_ps = kmalloc(sizeof(struct pending_seq), GFP_NOFS); + if (!next_ps) { + ret = -ENOMEM; + goto out; + } + + down_read(&nti->ring_commit_rwsem); + + spin_lock(&nti->seq_lock); + + list_for_each_entry(ps, &nti->pending_seqs, head) { + if (ps->seq == le64_to_cpu(*prev)) { + list_del_init(&ps->head); + kfree(ps); + break; + } + } + + *next = super->next_seq; + le64_add_cpu(&super->next_seq, 1); + + trace_printk("prev %llu next %llu, super next_seq %llu\n", + le64_to_cpup(prev), le64_to_cpup(next), + le64_to_cpu(super->next_seq)); + + next_ps->seq = le64_to_cpup(next); + list_add_tail(&next_ps->head, &nti->pending_seqs); + + spin_unlock(&nti->seq_lock); + + queue_commit_work(nti, &cw); + up_read(&nti->ring_commit_rwsem); + + ret = wait_for_commit(&cw); +out: + if (ret < 0) + sbuf->nh->status = SCOUTFS_NET_STATUS_ERROR; + else + sbuf->nh->status = SCOUTFS_NET_STATUS_SUCCESS; + + return sbuf; +} + +/* + * Give the client the last seq that is stable before the lowest seq + * that is still dirty out at a client. 
+ */ +static struct send_buf *process_get_last_seq(struct super_block *sb, + void *req, int req_len) +{ + DECLARE_NET_INFO(sb, nti); + struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + struct scoutfs_super_block *super = &sbi->super; + struct pending_seq *ps; + __le64 * __packed last; + struct send_buf *sbuf; + + if (req_len != 0) + return ERR_PTR(-EINVAL); + + sbuf = alloc_sbuf(sizeof(__le64)); + if (!sbuf) + return ERR_PTR(-ENOMEM); + + last = (void *)sbuf->nh->data; + + spin_lock(&nti->seq_lock); + ps = list_first_entry_or_null(&nti->pending_seqs, + struct pending_seq, head); + if (ps) { + *last = cpu_to_le64(ps->seq - 1); + } else { + *last = super->next_seq; + le64_add_cpu(last, -1ULL); + } + spin_unlock(&nti->seq_lock); + + trace_printk("last %llu\n", le64_to_cpup(last)); + + sbuf->nh->status = SCOUTFS_NET_STATUS_SUCCESS; + + return sbuf; +} + typedef struct send_buf *(*proc_func_t)(struct super_block *sb, void *req, int req_len); @@ -640,6 +770,8 @@ static proc_func_t type_proc_func(u8 type) [SCOUTFS_NET_ALLOC_SEGNO] = process_alloc_segno, [SCOUTFS_NET_RECORD_SEGMENT] = process_record_segment, [SCOUTFS_NET_BULK_ALLOC] = process_bulk_alloc, + [SCOUTFS_NET_ADVANCE_SEQ] = process_advance_seq, + [SCOUTFS_NET_GET_LAST_SEQ] = process_get_last_seq, }; return type < SCOUTFS_NET_UNKNOWN ? 
funcs[type] : NULL; @@ -726,9 +858,19 @@ static int process_reply(struct net_info *nti, struct recv_buf *rbuf) static void destroy_server_state(struct super_block *sb) { + DECLARE_NET_INFO(sb, nti); + struct pending_seq *ps; + struct pending_seq *tmp; + scoutfs_compact_destroy(sb); scoutfs_alloc_destroy(sb); scoutfs_manifest_destroy(sb); + + /* XXX these should be persistent and reclaimed during recovery */ + list_for_each_entry_safe(ps, tmp, &nti->pending_seqs, head) { + list_del_init(&ps->head); + kfree(ps); + } } /* @@ -1550,6 +1692,87 @@ int scoutfs_net_alloc_inodes(struct super_block *sb) alloc_inodes_reply, NULL); } +struct advance_seq_args { + u64 seq; + struct completion comp; + int ret; +}; + +static int advance_seq_reply(struct super_block *sb, void *reply, int ret, + void *arg) +{ + struct advance_seq_args *args = arg; + __le64 * __packed seq = reply; + + if (ret == sizeof(__le64)) { + args->seq = le64_to_cpup(seq); + args->ret = 0; + } else { + args->ret = -EINVAL; + } + + complete(&args->comp); /* args can be freed from this point */ + return args->ret; +} + +int scoutfs_net_advance_seq(struct super_block *sb, u64 *seq) +{ + struct advance_seq_args args; + __le64 leseq = cpu_to_le64p(seq); + int ret; + + init_completion(&args.comp); + + ret = add_send_buf(sb, SCOUTFS_NET_ADVANCE_SEQ, &leseq, + sizeof(leseq), advance_seq_reply, &args); + if (ret == 0) { + wait_for_completion(&args.comp); + *seq = args.seq; + ret = args.ret; + } + return ret; +} + +struct get_last_seq_args { + u64 seq; + struct completion comp; + int ret; +}; + +static int get_last_seq_reply(struct super_block *sb, void *reply, int ret, + void *arg) +{ + struct get_last_seq_args *args = arg; + __le64 * __packed seq = reply; + + if (ret == sizeof(__le64)) { + args->seq = le64_to_cpup(seq); + args->ret = 0; + } else { + args->ret = -EINVAL; + } + + complete(&args->comp); /* args can be freed from this point */ + return args->ret; +} + +int scoutfs_net_get_last_seq(struct super_block *sb, 
u64 *seq) +{ + struct get_last_seq_args args; + int ret; + + init_completion(&args.comp); + + ret = add_send_buf(sb, SCOUTFS_NET_GET_LAST_SEQ, NULL, 0, + get_last_seq_reply, &args); + if (ret == 0) { + wait_for_completion(&args.comp); + *seq = args.seq; + ret = args.ret; + } + return ret; +} + static struct sock_info *alloc_sinf(struct super_block *sb) { struct sock_info *sinf; @@ -1862,6 +2085,8 @@ int scoutfs_net_setup(struct super_block *sb) init_rwsem(&nti->ring_commit_rwsem); init_llist_head(&nti->ring_commit_waiters); INIT_WORK(&nti->ring_commit_work, scoutfs_net_ring_commit_func); + spin_lock_init(&nti->seq_lock); + INIT_LIST_HEAD(&nti->pending_seqs); INIT_LIST_HEAD(&nti->active_socks); sbi->net_info = nti; diff --git a/kmod/src/net.h b/kmod/src/net.h index e51d9266..ea131144 100644 --- a/kmod/src/net.h +++ b/kmod/src/net.h @@ -17,6 +17,8 @@ u64 *scoutfs_net_bulk_alloc(struct super_block *sb); int scoutfs_net_get_compaction(struct super_block *sb, void *curs); int scoutfs_net_finish_compaction(struct super_block *sb, void *curs, void *list); +int scoutfs_net_get_last_seq(struct super_block *sb, u64 *seq); +int scoutfs_net_advance_seq(struct super_block *sb, u64 *seq); int scoutfs_net_setup(struct super_block *sb); void scoutfs_net_destroy(struct super_block *sb); diff --git a/kmod/src/super.c b/kmod/src/super.c index f89bad20..aa2870dc 100644 --- a/kmod/src/super.c +++ b/kmod/src/super.c @@ -242,6 +242,10 @@ static int scoutfs_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) return -ENOMEM; + ret = scoutfs_net_advance_seq(sb, &sbi->trans_seq); + if (ret) + return ret; + scoutfs_trans_restart_sync_deadline(sb); // scoutfs_scan_orphans(sb); diff --git a/kmod/src/super.h b/kmod/src/super.h index 8bbd313b..39cc354a 100644 --- a/kmod/src/super.h +++ b/kmod/src/super.h @@ -39,10 +39,12 @@ struct scoutfs_sb_info { spinlock_t trans_write_lock; u64 trans_write_count; + u64 trans_seq; int trans_write_ret; struct delayed_work 
trans_write_work; wait_queue_head_t trans_write_wq; struct workqueue_struct *trans_write_workq; + bool trans_deadline_expired; struct lock_info *lock_info; struct net_info *net_info; diff --git a/kmod/src/trans.c b/kmod/src/trans.c index 58c12465..c55da90b 100644 --- a/kmod/src/trans.c +++ b/kmod/src/trans.c @@ -113,7 +113,18 @@ void scoutfs_trans_write_func(struct work_struct *work) goto out; scoutfs_inc_counter(sb, trans_level0_seg_write); + + } else if (sbi->trans_deadline_expired) { + /* + * If we're not writing data then we only advance the + * seq at the sync deadline interval. This keeps idle + * mounts from pinning a seq and stopping readers of the + * seq indices but doesn't send a message for every sync + * syscall. + */ + ret = scoutfs_net_advance_seq(sb, &sbi->trans_seq); } + out: /* XXX this all needs serious work for dealing with errors */ WARN_ON_ONCE(ret); @@ -160,6 +171,7 @@ static int write_attempted(struct scoutfs_sb_info *sbi, */ static void queue_trans_work(struct scoutfs_sb_info *sbi) { + sbi->trans_deadline_expired = false; mod_delayed_work(sbi->trans_write_workq, &sbi->trans_write_work, 0); } @@ -208,6 +220,7 @@ void scoutfs_trans_restart_sync_deadline(struct super_block *sb) { struct scoutfs_sb_info *sbi = SCOUTFS_SB(sb); + sbi->trans_deadline_expired = true; mod_delayed_work(sbi->trans_write_workq, &sbi->trans_write_work, TRANS_SYNC_DELAY); }