From b9a0f1709f15c020bf3f11932a47ca9316ddef4e Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 9 Jul 2021 15:41:50 -0700 Subject: [PATCH] Add xattr .totl. tag Add the .totl. xattr tag. When the tag is set the end of the name specifies a total name with 3 encoded u64s separated by dots. The value of the xattr is a u64 that is added to the named total. An ioctl is added to read the totals. Signed-off-by: Zach Brown --- kmod/src/btree.c | 136 ++++++++++++++-- kmod/src/counters.h | 9 ++ kmod/src/forest.c | 42 ++++- kmod/src/forest.h | 14 +- kmod/src/format.h | 22 ++- kmod/src/ioctl.c | 291 ++++++++++++++++++++++++++++++++++ kmod/src/ioctl.h | 51 ++++++ kmod/src/item.c | 88 +++++++++- kmod/src/item.h | 2 + kmod/src/lock.c | 14 ++ kmod/src/lock.h | 2 + kmod/src/xattr.c | 205 +++++++++++++++++++++++- kmod/src/xattr.h | 6 +- tests/golden/totl-xattr-tag | 30 ++++ tests/sequence | 1 + tests/tests/totl-xattr-tag.sh | 126 +++++++++++++++ utils/man/scoutfs.5 | 56 +++++++ utils/src/print.c | 14 ++ utils/src/read_xattr_totals.c | 120 ++++++++++++++ 19 files changed, 1194 insertions(+), 35 deletions(-) create mode 100644 tests/golden/totl-xattr-tag create mode 100644 tests/tests/totl-xattr-tag.sh create mode 100644 utils/src/read_xattr_totals.c diff --git a/kmod/src/btree.c b/kmod/src/btree.c index c05d0b06..90daa6ab 100644 --- a/kmod/src/btree.c +++ b/kmod/src/btree.c @@ -30,6 +30,7 @@ #include "avl.h" #include "hash.h" #include "sort_priv.h" +#include "forest.h" #include "scoutfs_trace.h" @@ -1902,9 +1903,23 @@ int scoutfs_btree_insert_list(struct super_block *sb, do { item = leaf_item_hash_search(sb, bt, &lst->key); if (item) { + /* try to merge delta values, _NULL not deleted; merge will */ + ret = scoutfs_forest_combine_deltas(&lst->key, + item_val(bt, item), + item_val_len(item), + lst->val, lst->val_len); + if (ret < 0) { + scoutfs_block_put(sb, bl); + goto out; + } + item->seq = cpu_to_le64(lst->seq); item->flags = lst->flags; - update_item_value(bt, item, lst->val, 
lst->val_len); + + if (ret == 0) + update_item_value(bt, item, lst->val, lst->val_len); + else + ret = 0; } else { scoutfs_avl_search(&bt->item_root, cmp_key_item, &lst->key, @@ -2039,6 +2054,16 @@ static struct merge_pos *first_mpos(struct rb_root *root) return NULL; } +static struct merge_pos *next_mpos(struct merge_pos *mpos) +{ + struct rb_node *node; + + if (mpos && (node = rb_next(&mpos->node))) + return container_of(node, struct merge_pos, node); + else + return NULL; +} + static void free_mpos(struct super_block *sb, struct merge_pos *mpos) { scoutfs_block_put(sb, mpos->bl); @@ -2142,6 +2167,56 @@ out: return ret; } +/* + * The caller has reset all the merge positions for all the input log + * btree roots and wants the next logged item it should try and merge + * with the items in the fs_root. + * + * We look ahead in the logged item stream to see if we should merge any + * older logged delta items into one result for the caller. We also + * take this opportunity to skip and reset the mpos for any older + * versions of the first item. 
+ */ +static int next_resolved_mpos(struct super_block *sb, struct rb_root *pos_root, + struct scoutfs_key *end, struct merge_pos **mpos_ret) +{ + struct merge_pos *mpos; + struct merge_pos *next; + struct scoutfs_key key; + int delta, ret = 0; + + while ((mpos = first_mpos(pos_root)) && (next = next_mpos(mpos)) && + !scoutfs_key_compare(mpos->key, next->key)) { + + ret = delta = scoutfs_forest_combine_deltas(mpos->key, mpos->val, mpos->val_len, + next->val, next->val_len); + if (ret < 0) + break; + + /* reset advances to the next item */ + key = *mpos->key; + scoutfs_key_inc(&key); + + /* always skip next combined or older version; ret is reused, delta keeps the combine result */ + ret = reset_mpos(sb, pos_root, next, &key, end); + if (ret < 0) + break; + + if (delta == SCOUTFS_DELTA_COMBINED) { + scoutfs_inc_counter(sb, btree_merge_delta_combined); + } else if (delta == SCOUTFS_DELTA_COMBINED_NULL) { + scoutfs_inc_counter(sb, btree_merge_delta_null); + /* if merging resulted in no info, skip current */ + ret = reset_mpos(sb, pos_root, mpos, &key, end); + if (ret < 0) + break; + } + } + + *mpos_ret = mpos; + return ret; +} + /* * Merge items from a number of read-only input roots into a writable * destination root. 
The order of the input roots doesn't matter, the @@ -2179,6 +2254,7 @@ int scoutfs_btree_merge(struct super_block *sb, int walk_val_len; int walk_flags; bool is_del; + int delta; int cmp; int ret; @@ -2205,7 +2281,7 @@ int scoutfs_btree_merge(struct super_block *sb, walk_flags |= BTW_SUBTREE; walk_val_len = 0; - while ((mpos = first_mpos(&pos_root))) { + while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) { if (scoutfs_block_writer_dirty_bytes(sb, wri) >= dirty_limit) { scoutfs_inc_counter(sb, btree_merge_dirty_limit); @@ -2233,7 +2309,13 @@ int scoutfs_btree_merge(struct super_block *sb, bt = bl->data; scoutfs_inc_counter(sb, btree_merge_walk); - for (; mpos; mpos = first_mpos(&pos_root)) { + /* catch non-root blocks that fell under low, maybe from null deltas */ + if (root->ref.blkno != bt->hdr.blkno && !total_above_join_low_water(bt)) { + walk_flags |= BTW_DELETE; + continue; + } + + while ((ret = next_resolved_mpos(sb, &pos_root, end, &mpos)) == 0 && mpos) { /* walk to new leaf if we exceed parent ref key */ if (scoutfs_key_compare(mpos->key, &kr.end) > 0) @@ -2243,6 +2325,23 @@ int scoutfs_btree_merge(struct super_block *sb, item = leaf_item_hash_search(sb, bt, mpos->key); is_del = !!(mpos->flags & SCOUTFS_ITEM_FLAG_DELETION); + /* see if we're merging delta items */ + if (item && !is_del) + delta = scoutfs_forest_combine_deltas(mpos->key, + item_val(bt, item), + item_val_len(item), + mpos->val, mpos->val_len); + else + delta = 0; + if (delta < 0) { + ret = delta; + goto out; + } else if (delta == SCOUTFS_DELTA_COMBINED) { + scoutfs_inc_counter(sb, btree_merge_delta_combined); + } else if (delta == SCOUTFS_DELTA_COMBINED_NULL) { + scoutfs_inc_counter(sb, btree_merge_delta_null); + } + trace_scoutfs_btree_merge_items(sb, mpos->root, mpos->key, mpos->val_len, item ? root : NULL, @@ -2250,7 +2349,7 @@ int scoutfs_btree_merge(struct super_block *sb, item ? 
item_val_len(item) : 0, is_del); /* rewalk and split if ins/update needs room */ - if (!is_del && !mid_free_item_room(bt, mpos->val_len)) { + if (!is_del && !delta && !mid_free_item_room(bt, mpos->val_len)) { walk_flags |= BTW_INSERT; walk_val_len = mpos->val_len; break; @@ -2267,13 +2366,31 @@ int scoutfs_btree_merge(struct super_block *sb, } /* update existing items */ - if (item && !is_del) { + if (item && !is_del && !delta) { item->seq = cpu_to_le64(mpos->seq); item->flags = mpos->flags; update_item_value(bt, item, mpos->val, mpos->val_len); scoutfs_inc_counter(sb, btree_merge_update); } + /* update combined delta item seq */ + if (delta == SCOUTFS_DELTA_COMBINED) { + item->seq = cpu_to_le64(mpos->seq); + } + + /* + * combined delta items that aren't needed are + * immediately dropped. We don't back off if + * the deletion would fall under the low water + * mark because we've already modified the + * value, we don't want to retry after a join + * and apply the value a second time. + */ + if (delta == SCOUTFS_DELTA_COMBINED_NULL) { + delete_item(bt, item, NULL); + scoutfs_inc_counter(sb, btree_merge_delta_null); + } + /* delete if merge item was deletion */ if (item && is_del) { /* rewalk and join if non-root falls under low water mark */ @@ -2293,12 +2410,9 @@ int scoutfs_btree_merge(struct super_block *sb, /* finished with this key, skip any older items */ next = *mpos->key; scoutfs_key_inc(&next); - while (mpos && scoutfs_key_compare(mpos->key, &next) < 0) { - ret = reset_mpos(sb, &pos_root, mpos, &next, end); - if (ret < 0) - goto out; - mpos = first_mpos(&pos_root); - } + ret = reset_mpos(sb, &pos_root, mpos, &next, end); + if (ret < 0) + goto out; } } diff --git a/kmod/src/counters.h b/kmod/src/counters.h index 0e7db927..234c489d 100644 --- a/kmod/src/counters.h +++ b/kmod/src/counters.h @@ -47,6 +47,8 @@ EXPAND_COUNTER(btree_merge) \ EXPAND_COUNTER(btree_merge_alloc_low) \ EXPAND_COUNTER(btree_merge_delete) \ + EXPAND_COUNTER(btree_merge_delta_combined) \ 
+ EXPAND_COUNTER(btree_merge_delta_null) \ EXPAND_COUNTER(btree_merge_dirty_limit) \ EXPAND_COUNTER(btree_merge_drop_old) \ EXPAND_COUNTER(btree_merge_insert) \ @@ -91,6 +93,8 @@ EXPAND_COUNTER(item_clear_dirty) \ EXPAND_COUNTER(item_create) \ EXPAND_COUNTER(item_delete) \ + EXPAND_COUNTER(item_delta) \ + EXPAND_COUNTER(item_delta_written) \ EXPAND_COUNTER(item_dirty) \ EXPAND_COUNTER(item_invalidate) \ EXPAND_COUNTER(item_invalidate_page) \ @@ -188,6 +192,11 @@ EXPAND_COUNTER(srch_search_xattrs) \ EXPAND_COUNTER(srch_read_stale) \ EXPAND_COUNTER(statfs) \ + EXPAND_COUNTER(totl_read_copied) \ + EXPAND_COUNTER(totl_read_finalized) \ + EXPAND_COUNTER(totl_read_fs) \ + EXPAND_COUNTER(totl_read_item) \ + EXPAND_COUNTER(totl_read_logged) \ EXPAND_COUNTER(trans_commit_data_alloc_low) \ EXPAND_COUNTER(trans_commit_dirty_meta_full) \ EXPAND_COUNTER(trans_commit_fsync) \ diff --git a/kmod/src/forest.c b/kmod/src/forest.c index 65cf26f0..6890fbd7 100644 --- a/kmod/src/forest.c +++ b/kmod/src/forest.c @@ -26,6 +26,7 @@ #include "hash.h" #include "srch.h" #include "counters.h" +#include "xattr.h" #include "scoutfs_trace.h" /* @@ -221,7 +222,7 @@ out: } struct forest_read_items_data { - bool is_fs; + int fic; scoutfs_forest_item_cb cb; void *cb_arg; }; @@ -231,7 +232,7 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6 { struct forest_read_items_data *rid = arg; - return rid->cb(sb, key, seq, flags, val, val_len, rid->cb_arg); + return rid->cb(sb, key, seq, flags, val, val_len, rid->fic, rid->cb_arg); } /* @@ -247,8 +248,8 @@ static int forest_read_items(struct super_block *sb, struct scoutfs_key *key, u6 * to reset their state and retry with a newer version of the btrees. 
*/ int scoutfs_forest_read_items(struct super_block *sb, - struct scoutfs_lock *lock, struct scoutfs_key *key, + struct scoutfs_key *bloom_key, struct scoutfs_key *start, struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg) @@ -264,11 +265,13 @@ int scoutfs_forest_read_items(struct super_block *sb, SCOUTFS_BTREE_ITEM_REF(iref); struct scoutfs_block *bl; struct scoutfs_key ltk; + struct scoutfs_key orig_start = *start; + struct scoutfs_key orig_end = *end; int ret; int i; scoutfs_inc_counter(sb, forest_read_items); - calc_bloom_nrs(&bloom, &lock->start); + calc_bloom_nrs(&bloom, bloom_key); ret = scoutfs_client_get_roots(sb, &roots); if (ret) @@ -276,16 +279,16 @@ int scoutfs_forest_read_items(struct super_block *sb, trace_scoutfs_forest_using_roots(sb, &roots.fs_root, &roots.logs_root); - *start = lock->start; - *end = lock->end; + *start = orig_start; + *end = orig_end; /* start with fs root items */ - rid.is_fs = true; + rid.fic |= FIC_FS_ROOT; ret = scoutfs_btree_read_items(sb, &roots.fs_root, key, start, end, forest_read_items, &rid); if (ret < 0) goto out; - rid.is_fs = false; + rid.fic &= ~FIC_FS_ROOT; scoutfs_key_init_log_trees(<k, 0, 0); for (;; scoutfs_key_inc(<k)) { @@ -330,10 +333,15 @@ int scoutfs_forest_read_items(struct super_block *sb, scoutfs_inc_counter(sb, forest_bloom_pass); + if ((le64_to_cpu(lt.flags) & SCOUTFS_LOG_TREES_FINALIZED)) + rid.fic |= FIC_FINALIZED; + ret = scoutfs_btree_read_items(sb, <.item_root, key, start, end, forest_read_items, &rid); if (ret < 0) goto out; + + rid.fic &= ~FIC_FINALIZED; } ret = 0; @@ -341,6 +349,24 @@ out: return ret; } +/* + * If the items are deltas then combine the src with the destination + * value and store the result in the destination. 
+ * + * Returns: + * -errno: fatal error, no change + * 0: not delta items, no change + * +ve: SCOUTFS_DELTA_ values indicating when dst and/or src can be dropped + */ +int scoutfs_forest_combine_deltas(struct scoutfs_key *key, void *dst, int dst_len, + void *src, int src_len) +{ + if (key->sk_zone == SCOUTFS_XATTR_TOTL_ZONE) + return scoutfs_xattr_combine_totl(dst, dst_len, src, src_len); + + return 0; +} + /* * Make sure that the bloom bits for the lock's start key are all set in * the current log's bloom block. We record the nr of our log tree in diff --git a/kmod/src/forest.h b/kmod/src/forest.h index 1d95b038..8084731f 100644 --- a/kmod/src/forest.h +++ b/kmod/src/forest.h @@ -8,14 +8,18 @@ struct scoutfs_block; #include "btree.h" /* caller gives an item to the callback */ +enum { + FIC_FS_ROOT = (1 << 0), + FIC_FINALIZED = (1 << 1), +}; typedef int (*scoutfs_forest_item_cb)(struct super_block *sb, struct scoutfs_key *key, u64 seq, - u8 flags, void *val, int val_len, void *arg); + u8 flags, void *val, int val_len, int fic, void *arg); int scoutfs_forest_next_hint(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_key *next); int scoutfs_forest_read_items(struct super_block *sb, - struct scoutfs_lock *lock, struct scoutfs_key *key, + struct scoutfs_key *bloom_key, struct scoutfs_key *start, struct scoutfs_key *end, scoutfs_forest_item_cb cb, void *arg); @@ -36,6 +40,12 @@ void scoutfs_forest_init_btrees(struct super_block *sb, void scoutfs_forest_get_btrees(struct super_block *sb, struct scoutfs_log_trees *lt); +/* > 0 error codes */ +#define SCOUTFS_DELTA_COMBINED 1 /* src val was combined, drop src */ +#define SCOUTFS_DELTA_COMBINED_NULL 2 /* combined val has no data, drop both */ +int scoutfs_forest_combine_deltas(struct scoutfs_key *key, void *dst, int dst_len, + void *src, int src_len); + int scoutfs_forest_setup(struct super_block *sb); void scoutfs_forest_start(struct super_block *sb); void scoutfs_forest_stop(struct super_block *sb); diff 
--git a/kmod/src/format.h b/kmod/src/format.h index 8d9475b7..196cd4fb 100644 --- a/kmod/src/format.h +++ b/kmod/src/format.h @@ -168,6 +168,11 @@ struct scoutfs_key { #define sko_rid _sk_first #define sko_ino _sk_second +/* xattr totl */ +#define skxt_a _sk_first +#define skxt_b _sk_second +#define skxt_c _sk_third + /* inode */ #define ski_ino _sk_first @@ -568,8 +573,9 @@ struct scoutfs_log_merge_freeing { */ #define SCOUTFS_INODE_INDEX_ZONE 1 #define SCOUTFS_ORPHAN_ZONE 2 -#define SCOUTFS_FS_ZONE 3 -#define SCOUTFS_LOCK_ZONE 4 +#define SCOUTFS_XATTR_TOTL_ZONE 3 +#define SCOUTFS_FS_ZONE 4 +#define SCOUTFS_LOCK_ZONE 5 /* Items only stored in server btrees */ #define SCOUTFS_LOG_TREES_ZONE 6 #define SCOUTFS_TRANS_SEQ_ZONE 7 @@ -633,6 +639,17 @@ struct scoutfs_xattr { __u8 name[]; }; +/* + * .totl. xattrs are mapped to items. The dotted u64s in the xattr name + * map to the item key. The item value total is the sum of all the + * xattr values. The item value count records the number of xattrs + * contributing to the total and is used when combining logged items to + * determine if totals are being created or destroyed. + */ +struct scoutfs_xattr_totl_val { + __le64 total; + __le64 count; +}; /* XXX does this exist upstream somewhere? 
*/ #define member_sizeof(TYPE, MEMBER) (sizeof(((TYPE *)0)->MEMBER)) @@ -883,6 +900,7 @@ enum scoutfs_dentry_type { #define SCOUTFS_XATTR_MAX_NAME_LEN 255 #define SCOUTFS_XATTR_MAX_VAL_LEN 65535 #define SCOUTFS_XATTR_MAX_PART_SIZE SCOUTFS_MAX_VAL_SIZE +#define SCOUTFS_XATTR_MAX_TOTL_U64 23 /* octal U64_MAX */ #define SCOUTFS_XATTR_NR_PARTS(name_len, val_len) \ DIV_ROUND_UP(sizeof(struct scoutfs_xattr) + name_len + val_len, \ diff --git a/kmod/src/ioctl.c b/kmod/src/ioctl.c index 088daed9..dc3d12da 100644 --- a/kmod/src/ioctl.c +++ b/kmod/src/ioctl.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "format.h" #include "key.h" @@ -39,6 +40,7 @@ #include "srch.h" #include "alloc.h" #include "server.h" +#include "counters.h" #include "scoutfs_trace.h" /* @@ -1041,6 +1043,293 @@ out: return ret; } +struct xattr_total_entry { + struct rb_node node; + struct scoutfs_ioctl_xattr_total xt; + u64 fs_seq; + u64 fs_total; + u64 fs_count; + u64 fin_seq; + u64 fin_total; + s64 fin_count; + u64 log_seq; + u64 log_total; + s64 log_count; +}; + +static int cmp_xt_entry_name(const struct xattr_total_entry *a, + const struct xattr_total_entry *b) + +{ + return scoutfs_cmp_u64s(a->xt.name[0], b->xt.name[0]) ?: + scoutfs_cmp_u64s(a->xt.name[1], b->xt.name[1]) ?: + scoutfs_cmp_u64s(a->xt.name[2], b->xt.name[2]); +} + +/* + * Record the contribution of the three classes of logged items we can + * see: the item in the fs_root, items from finalized log btrees, and + * items from active log btrees. Once we have the full set the caller + * can decide which of the items contribute to the total it sends to the + * user. 
+ */ +static int read_xattr_total_item(struct super_block *sb, struct scoutfs_key *key, + u64 seq, u8 flags, void *val, int val_len, int fic, void *arg) +{ + struct scoutfs_xattr_totl_val *tval = val; + struct xattr_total_entry *ent; + struct xattr_total_entry rd; + struct rb_root *root = arg; + struct rb_node *parent; + struct rb_node **node; + int cmp; + + rd.xt.name[0] = le64_to_cpu(key->skxt_a); + rd.xt.name[1] = le64_to_cpu(key->skxt_b); + rd.xt.name[2] = le64_to_cpu(key->skxt_c); + + /* find entry matching name */ + node = &root->rb_node; + parent = NULL; + cmp = -1; + while (*node) { + parent = *node; + ent = container_of(*node, struct xattr_total_entry, node); + + /* sort merge items by key then newest to oldest */ + cmp = cmp_xt_entry_name(&rd, ent); + if (cmp < 0) + node = &(*node)->rb_left; + else if (cmp > 0) + node = &(*node)->rb_right; + else + break; + } + + /* allocate and insert new node if we need to */ + if (cmp != 0) { + ent = kzalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + return -ENOMEM; + + memcpy(&ent->xt.name, &rd.xt.name, sizeof(ent->xt.name)); + + rb_link_node(&ent->node, parent, node); + rb_insert_color(&ent->node, root); + } + + if (fic & FIC_FS_ROOT) { + ent->fs_seq = seq; + ent->fs_total = le64_to_cpu(tval->total); + ent->fs_count = le64_to_cpu(tval->count); + } else if (fic & FIC_FINALIZED) { + ent->fin_seq = seq; + ent->fin_total += le64_to_cpu(tval->total); + ent->fin_count += le64_to_cpu(tval->count); + } else { + ent->log_seq = seq; + ent->log_total += le64_to_cpu(tval->total); + ent->log_count += le64_to_cpu(tval->count); + } + + scoutfs_inc_counter(sb, totl_read_item); + + return 0; +} + +/* these are always _safe, node stores next */ +#define for_each_xt_ent(ent, node, root) \ + for (node = rb_first(root); \ + node && (ent = rb_entry(node, struct xattr_total_entry, node), \ + node = rb_next(node), 1); ) + +#define for_each_xt_ent_reverse(ent, node, root) \ + for (node = rb_last(root); \ + node && (ent = rb_entry(node, struct 
xattr_total_entry, node), \ + node = rb_prev(node), 1); ) + +static void free_xt_ent(struct rb_root *root, struct xattr_total_entry *ent) +{ + rb_erase(&ent->node, root); + kfree(ent); +} + +static void free_all_xt_ents(struct rb_root *root) +{ + struct xattr_total_entry *ent; + struct rb_node *node; + + for_each_xt_ent(ent, node, root) + free_xt_ent(root, ent); +} + +/* + * Starting from the caller's pos_name, copy the names, totals, and + * counts for the .totl. tagged xattrs in the system sorted by their + * name until the user's buffer is full. This only sees xattrs that + * have been committed. It doesn't use locking to force commits and + * block writers so it can be a little bit out of date with respect to + * dirty xattrs in memory across the system. + * + * Our reader has to be careful because the log btree merging code can + * write partial results to the fs_root. This means that a reader can + * see both cases where new finalized logs should be applied to the old + * fs items and where old finalized logs have already been applied to + * the partially merged fs items. Currently active logged items are + * always applied on top of all cases. + * + * These cases are differentiated with a combination of sequence numbers + * in items, the count of contributing xattrs, and a flag + * differentiating finalized and active logged items. This lets us + * recognize all cases, including when finalized logs were merged and + * deleted the fs item. + * + * We're allocating a tracking struct for each totl name we see while + * traversing the item btrees. The forest reader is providing the items + * it finds in leaf blocks that contain the search key. In the worst + * case all of these blocks are full and none of the items overlap. At + * most, figure order a thousand names per mount. 
But in practice many + * of these factors fall away: leaf blocks aren't full, leaf items + * overlap, there aren't finalized log btrees, and not all mounts are + * actively changing totals. We're much more likely to only read a + * leaf block's worth of totals that have been long since merged into + * the fs_root. + */ +static long scoutfs_ioc_read_xattr_totals(struct file *file, unsigned long arg) +{ + struct super_block *sb = file_inode(file)->i_sb; + struct scoutfs_ioctl_read_xattr_totals __user *urxt = (void __user *)arg; + struct scoutfs_ioctl_read_xattr_totals rxt; + struct scoutfs_ioctl_xattr_total __user *uxt; + struct xattr_total_entry *ent; + struct scoutfs_key key; + struct scoutfs_key bloom_key; + struct scoutfs_key start; + struct scoutfs_key end; + struct rb_root root = RB_ROOT; + struct rb_node *node; + int count = 0; + int ret; + + if (!(file->f_mode & FMODE_READ)) { + ret = -EBADF; + goto out; + } + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + + if (copy_from_user(&rxt, urxt, sizeof(rxt))) { + ret = -EFAULT; + goto out; + } + uxt = (void __user *)rxt.totals_ptr; + + if ((rxt.totals_ptr & (sizeof(__u64) - 1)) || + (rxt.totals_bytes < sizeof(struct scoutfs_ioctl_xattr_total))) { + ret = -EINVAL; + goto out; + } + + scoutfs_key_set_zeros(&bloom_key); + bloom_key.sk_zone = SCOUTFS_XATTR_TOTL_ZONE; + scoutfs_xattr_init_totl_key(&start, rxt.pos_name); + + while (rxt.totals_bytes >= sizeof(struct scoutfs_ioctl_xattr_total)) { + + scoutfs_key_set_ones(&end); + end.sk_zone = SCOUTFS_XATTR_TOTL_ZONE; + if (scoutfs_key_compare(&start, &end) > 0) + break; + + key = start; + ret = scoutfs_forest_read_items(sb, &key, &bloom_key, &start, &end, + read_xattr_total_item, &root); + if (ret < 0) { + if (ret == -ESTALE) { + free_all_xt_ents(&root); + continue; + } + goto out; + } + + if (RB_EMPTY_ROOT(&root)) + break; + + /* trim totals that fall outside of the consistent range */ + for_each_xt_ent(ent, node, &root) { + 
scoutfs_xattr_init_totl_key(&key, ent->xt.name); + if (scoutfs_key_compare(&key, &start) < 0) { + free_xt_ent(&root, ent); + } else { + break; + } + } + for_each_xt_ent_reverse(ent, node, &root) { + scoutfs_xattr_init_totl_key(&key, ent->xt.name); + if (scoutfs_key_compare(&key, &end) > 0) { + free_xt_ent(&root, ent); + } else { + break; + } + } + + /* copy resulting unique non-zero totals to userspace */ + for_each_xt_ent(ent, node, &root) { + if (rxt.totals_bytes < sizeof(ent->xt)) + break; + + /* start with the fs item if we have it */ + if (ent->fs_seq != 0) { + ent->xt.total = ent->fs_total; + ent->xt.count = ent->fs_count; + scoutfs_inc_counter(sb, totl_read_fs); + } + + /* apply finalized logs if they're newer or creating */ + if (((ent->fs_seq != 0) && (ent->fin_seq > ent->fs_seq)) || + ((ent->fs_seq == 0) && (ent->fin_count > 0))) { + ent->xt.total += ent->fin_total; + ent->xt.count += ent->fin_count; + scoutfs_inc_counter(sb, totl_read_finalized); + } + + /* always apply active logs which must be newer than fs and finalized */ + if (ent->log_seq > 0) { + ent->xt.total += ent->log_total; + ent->xt.count += ent->log_count; + scoutfs_inc_counter(sb, totl_read_logged); + } + + if (ent->xt.total != 0 || ent->xt.count != 0) { + if (copy_to_user(uxt, &ent->xt, sizeof(ent->xt))) { + ret = -EFAULT; + goto out; + } + + uxt++; + rxt.totals_bytes -= sizeof(ent->xt); + count++; + scoutfs_inc_counter(sb, totl_read_copied); + } + + free_xt_ent(&root, ent); + } + + /* continue after the last possible key read */ + start = end; + scoutfs_key_inc(&start); + } + + ret = 0; +out: + free_all_xt_ents(&root); + + return ret ?: count; +} + long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1072,6 +1361,8 @@ long scoutfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return scoutfs_ioc_move_blocks(file, arg); case SCOUTFS_IOC_RESIZE_DEVICES: return scoutfs_ioc_resize_devices(file, arg); + case 
SCOUTFS_IOC_READ_XATTR_TOTALS: + return scoutfs_ioc_read_xattr_totals(file, arg); } return -ENOTTY; diff --git a/kmod/src/ioctl.h b/kmod/src/ioctl.h index 3dffa37e..8b4decf0 100644 --- a/kmod/src/ioctl.h +++ b/kmod/src/ioctl.h @@ -490,4 +490,55 @@ struct scoutfs_ioctl_resize_devices { #define SCOUTFS_IOC_RESIZE_DEVICES \ _IOR(SCOUTFS_IOCTL_MAGIC, 14, struct scoutfs_ioctl_resize_devices) +#define SCOUTFs_IOCTL_XATTR_TOTAL_NAME_NR 3 + +/* + * Copy global totals of .totl. xattr value payloads to the user. This + * only sees xattrs which have been committed and this doesn't force + * commits of dirty data throughout the system. This can be out of sync + * by the amount of xattrs that can be dirty in open transactions that + * are being built throughout the system. + * + * pos_name: The array name of the first total that can be returned. + * The name is derived from the key of the xattrs that contribute to the + * total. For xattrs with a .totl.1.2.3 key, the pos_name[] should be + * {1, 2, 3}. + * + * totals_ptr: An aligned pointer to a buffer that will be filled with + * an array of scoutfs_ioctl_xattr_total structs for each total copied. + * + * totals_bytes: The size of the buffer in bytes. There must be room + * for at least one struct element so that returning 0 can promise that + * there were no more totals to copy after the pos_name. + * + * The number of copied elements is returned and 0 is returned if there + * were no more totals to copy after the pos_name. + * + * In addition to the usual errnos (EIO, EINVAL, EPERM, EFAULT) this + * adds: + * + * EINVAL: The totals_ buffer was not aligned or was not large enough + * for a single struct entry. + */ +struct scoutfs_ioctl_read_xattr_totals { + __u64 pos_name[SCOUTFs_IOCTL_XATTR_TOTAL_NAME_NR]; + __u64 totals_ptr; + __u64 totals_bytes; +}; + +/* + * An individual total that is given to userspace. The total is the + * sum of all the values in the xattr payloads matching the name. 
The + * count is the number of xattrs, not number of files, contributing to + * the total. + */ +struct scoutfs_ioctl_xattr_total { + __u64 name[SCOUTFs_IOCTL_XATTR_TOTAL_NAME_NR]; + __u64 total; + __u64 count; +}; + +#define SCOUTFS_IOC_READ_XATTR_TOTALS \ + _IOR(SCOUTFS_IOCTL_MAGIC, 15, struct scoutfs_ioctl_read_xattr_totals) + #endif diff --git a/kmod/src/item.c b/kmod/src/item.c index c05198df..7151b380 100644 --- a/kmod/src/item.c +++ b/kmod/src/item.c @@ -139,7 +139,8 @@ struct cached_item { struct list_head dirty_head; unsigned int dirty:1, /* needs to be written */ persistent:1, /* in btrees, needs deletion item */ - deletion:1; /* negative del item for writing */ + deletion:1, /* negative del item for writing */ + delta:1; /* item values are combined, freed after write */ unsigned int val_len; struct scoutfs_key key; u64 seq; @@ -415,6 +416,7 @@ static struct cached_item *alloc_item(struct cached_page *pg, item->dirty = 0; item->persistent = 0; item->deletion = !!deletion; + item->delta = 0; item->val_len = val_len; item->key = *key; item->seq = seq; @@ -720,6 +722,7 @@ static void move_page_items(struct super_block *sb, } to->persistent = from->persistent; + to->delta = from->delta; erase_item(left, from); } @@ -1353,7 +1356,7 @@ static void del_active_reader(struct item_cache_info *cinf, struct active_reader * don't have to compare seqs. 
*/ static int read_page_item(struct super_block *sb, struct scoutfs_key *key, u64 seq, u8 flags, - void *val, int val_len, void *arg) + void *val, int val_len, int fic, void *arg) { DECLARE_ITEM_CACHE_INFO(sb, cinf); const bool deletion = !!(flags & SCOUTFS_ITEM_FLAG_DELETION); @@ -1480,8 +1483,9 @@ static int read_pages(struct super_block *sb, struct item_cache_info *cinf, /* set active reader seq before reading persistent roots */ add_active_reader(sb, &active); - ret = scoutfs_forest_read_items(sb, lock, key, &start, &end, - read_page_item, &root); + start = lock->start; + end = lock->end; + ret = scoutfs_forest_read_items(sb, key, &lock->start, &start, &end, read_page_item, &root); if (ret < 0) goto out; @@ -2006,6 +2010,77 @@ out: return ret; } +/* + * Add a delta item. Delta items are an incremental change relative to + * the current persistent delta items. We never have to read the + * current items so the caller always writes with write only locks. If + * combining the current delta item and the caller's item results in a + * null we can just drop it, we don't have to emit a deletion item. 
+ */ +int scoutfs_item_delta(struct super_block *sb, struct scoutfs_key *key, + void *val, int val_len, struct scoutfs_lock *lock) +{ + DECLARE_ITEM_CACHE_INFO(sb, cinf); + const u64 seq = item_seq(sb, lock); + struct cached_item *item; + struct cached_page *pg; + struct rb_node **pnode; + struct rb_node *par; + int ret; + + scoutfs_inc_counter(sb, item_delta); + + if ((ret = lock_safe(lock, key, SCOUTFS_LOCK_WRITE_ONLY))) + goto out; + + ret = scoutfs_forest_set_bloom_bits(sb, lock); + if (ret < 0) + goto out; + + ret = get_cached_page(sb, cinf, lock, key, true, true, val_len, &pg); + if (ret < 0) + goto out; + __acquire(pg->rwlock); + + item = item_rbtree_walk(&pg->item_root, key, NULL, &par, &pnode); + if (item) { + if (!item->delta) { + ret = -EIO; + goto unlock; + } + + ret = scoutfs_forest_combine_deltas(key, item->val, item->val_len, val, val_len); + if (ret <= 0) { + if (ret == 0) + ret = -EIO; + goto unlock; + } + + if (ret == SCOUTFS_DELTA_COMBINED) { + item->seq = seq; + mark_item_dirty(sb, cinf, pg, NULL, item); + } else if (ret == SCOUTFS_DELTA_COMBINED_NULL) { + clear_item_dirty(sb, cinf, pg, item); + erase_item(pg, item); + } else { + ret = -EIO; + goto unlock; + } + ret = 0; + } else { + item = alloc_item(pg, key, seq, false, val, val_len); + rbtree_insert(&item->node, par, pnode, &pg->item_root); + mark_item_dirty(sb, cinf, pg, NULL, item); + item->delta = 1; + ret = 0; + } + +unlock: + write_unlock(&pg->rwlock); +out: + return ret; +} + /* * Delete an item from the cache. We can leave behind a dirty deletion * item if there is a persistent item that needs to be overwritten. 
@@ -2280,8 +2355,11 @@ retry: dirty_head) { clear_item_dirty(sb, cinf, pg, item); + if (item->delta) + scoutfs_inc_counter(sb, item_delta_written); + /* free deletion items */ - if (item->deletion) + if (item->deletion || item->delta) erase_item(pg, item); else item->persistent = 1; diff --git a/kmod/src/item.h b/kmod/src/item.h index ae4046e7..431866d5 100644 --- a/kmod/src/item.h +++ b/kmod/src/item.h @@ -18,6 +18,8 @@ int scoutfs_item_create_force(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_lock *lock); int scoutfs_item_update(struct super_block *sb, struct scoutfs_key *key, void *val, int val_len, struct scoutfs_lock *lock); +int scoutfs_item_delta(struct super_block *sb, struct scoutfs_key *key, + void *val, int val_len, struct scoutfs_lock *lock); int scoutfs_item_delete(struct super_block *sb, struct scoutfs_key *key, struct scoutfs_lock *lock); int scoutfs_item_delete_force(struct super_block *sb, diff --git a/kmod/src/lock.c b/kmod/src/lock.c index 41479ded..ca674c8f 100644 --- a/kmod/src/lock.c +++ b/kmod/src/lock.c @@ -1237,6 +1237,20 @@ int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int return lock_key_range(sb, mode, flags, &start, &end, lock); } +int scoutfs_lock_xattr_totl(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, + struct scoutfs_lock **lock) +{ + struct scoutfs_key start; + struct scoutfs_key end; + /* the range runs from the zeroed key to the all-ones key, both set to the totl zone */ + scoutfs_key_set_zeros(&start); + start.sk_zone = SCOUTFS_XATTR_TOTL_ZONE; + scoutfs_key_set_ones(&end); + end.sk_zone = SCOUTFS_XATTR_TOTL_ZONE; + + return lock_key_range(sb, mode, flags, &start, &end, lock); +} + void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scoutfs_lock_mode mode) { DECLARE_LOCK_INFO(sb, linfo); diff --git a/kmod/src/lock.h b/kmod/src/lock.h index 71b65464..5d7a3ce7 100644 --- a/kmod/src/lock.h +++ b/kmod/src/lock.h @@ -84,6 +84,8 @@ int scoutfs_lock_rename(struct super_block *sb, enum scoutfs_lock_mode mode, int struct scoutfs_lock
**lock); int scoutfs_lock_orphan(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, u64 ino, struct scoutfs_lock **lock); +int scoutfs_lock_xattr_totl(struct super_block *sb, enum scoutfs_lock_mode mode, int flags, + struct scoutfs_lock **lock); void scoutfs_unlock(struct super_block *sb, struct scoutfs_lock *lock, enum scoutfs_lock_mode mode); diff --git a/kmod/src/xattr.c b/kmod/src/xattr.c index fd8acd8e..50bd4d45 100644 --- a/kmod/src/xattr.c +++ b/kmod/src/xattr.c @@ -97,6 +97,7 @@ static int unknown_prefix(const char *name) #define HIDE_TAG "hide." #define SRCH_TAG "srch." +#define TOTL_TAG "totl." #define TAG_LEN (sizeof(HIDE_TAG) - 1) int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len, @@ -119,6 +120,9 @@ int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len, } else if (!strncmp(name, SRCH_TAG, TAG_LEN)) { if (++tgs->srch == 0) return -EINVAL; + } else if (!strncmp(name, TOTL_TAG, TAG_LEN)) { + if (++tgs->totl == 0) + return -EINVAL; } else { /* only reason to use scoutfs. is tags */ if (!found) @@ -468,6 +472,100 @@ out: return ret; } +void scoutfs_xattr_init_totl_key(struct scoutfs_key *key, u64 *name) +{ + scoutfs_key_set_zeros(key); + key->sk_zone = SCOUTFS_XATTR_TOTL_ZONE; + key->skxt_a = cpu_to_le64(name[0]); + key->skxt_b = cpu_to_le64(name[1]); + key->skxt_c = cpu_to_le64(name[2]); +} + +/* + * Parse a u64 in any base after null terminating it while forbidding + * the leading + and trailing \n that kstrtoull allows. + */ +static int parse_totl_u64(const char *s, int len, u64 *res) +{ + char str[SCOUTFS_XATTR_MAX_TOTL_U64 + 1]; + + if (len <= 0 || len >= ARRAY_SIZE(str) || s[0] == '+' || s[len - 1] == '\n') + return -EINVAL; + + memcpy(str, s, len); + str[len] = '\0'; + + return kstrtoull(str, 0, res) != 0 ? -EINVAL : 0; +} + +/* + * non-destructive relatively quick parse of the last 3 dotted u64s that + * make up the name of the xattr total.
-EINVAL is returned if there + * is anything but 3 valid u64 encodings between single dots at the end + * of the name. + */ +static int parse_totl_key(struct scoutfs_key *key, const char *name, int name_len) +{ + u64 tot_name[3]; + int end = name_len; + int nr = 0; + int len; + int ret; + int i; + + /* parse name elements in reverse order from end of xattr name string */ + for (i = name_len - 1; i >= 0 && nr < ARRAY_SIZE(tot_name); i--) { + if (name[i] != '.') + continue; + + len = end - (i + 1); + ret = parse_totl_u64(&name[i + 1], len, &tot_name[nr]); + if (ret < 0) + goto out; + + end = i; + nr++; + } + + if (nr == ARRAY_SIZE(tot_name)) { + /* swap to account for parsing in reverse */ + swap(tot_name[0], tot_name[2]); + scoutfs_xattr_init_totl_key(key, tot_name); + ret = 0; + } else { + ret = -EINVAL; + } + +out: + return ret; +} + +static int apply_totl_delta(struct super_block *sb, struct scoutfs_key *key, + struct scoutfs_xattr_totl_val *tval, struct scoutfs_lock *lock) +{ + if (tval->total == 0 && tval->count == 0) + return 0; + + return scoutfs_item_delta(sb, key, tval, sizeof(*tval), lock); +} + +int scoutfs_xattr_combine_totl(void *dst, int dst_len, void *src, int src_len) +{ + struct scoutfs_xattr_totl_val *s_tval = src; + struct scoutfs_xattr_totl_val *d_tval = dst; + + if (src_len != sizeof(*s_tval) || dst_len != src_len) + return -EIO; + + le64_add_cpu(&d_tval->total, le64_to_cpu(s_tval->total)); + le64_add_cpu(&d_tval->count, le64_to_cpu(s_tval->count)); + + if (d_tval->total == 0 && d_tval->count == 0) + return SCOUTFS_DELTA_COMBINED_NULL; + + return SCOUTFS_DELTA_COMBINED; +} + /* * The confusing swiss army knife of creating, modifying, and deleting * xattrs.
@@ -486,16 +584,22 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name, struct scoutfs_inode_info *si = SCOUTFS_I(inode); struct super_block *sb = inode->i_sb; const u64 ino = scoutfs_ino(inode); + struct scoutfs_xattr_totl_val tval = {0,}; struct scoutfs_xattr_prefix_tags tgs; struct scoutfs_xattr *xat = NULL; struct scoutfs_lock *lck = NULL; + struct scoutfs_lock *totl_lock = NULL; size_t name_len = strlen(name); + struct scoutfs_key totl_key; struct scoutfs_key key; bool undo_srch = false; + bool undo_totl = false; LIST_HEAD(ind_locks); u8 found_parts; unsigned int bytes; + unsigned int val_len; u64 ind_seq; + u64 total; u64 hash = 0; u64 id = 0; int ret; @@ -519,11 +623,15 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name, if (scoutfs_xattr_parse_tags(name, name_len, &tgs) != 0) return -EINVAL; - if ((tgs.hide || tgs.srch) && !capable(CAP_SYS_ADMIN)) + if ((tgs.hide | tgs.srch | tgs.totl) && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (tgs.totl && ((ret = parse_totl_key(&totl_key, name, name_len)) != 0)) + return ret; + bytes = sizeof(struct scoutfs_xattr) + name_len + size; - xat = __vmalloc(bytes, GFP_NOFS, PAGE_KERNEL); + /* alloc enough to read old totl value */ + xat = __vmalloc(bytes + SCOUTFS_XATTR_MAX_TOTL_U64, GFP_NOFS, PAGE_KERNEL); if (!xat) { ret = -ENOMEM; goto out; @@ -536,9 +644,9 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name, down_write(&si->xattr_rwsem); - /* find an existing xattr to delete */ + /* find an existing xattr to delete, including possible totl value */ ret = get_next_xattr(inode, &key, xat, - sizeof(struct scoutfs_xattr) + name_len, + sizeof(struct scoutfs_xattr) + name_len + SCOUTFS_XATTR_MAX_TOTL_U64, name, name_len, 0, 0, lck); if (ret < 0 && ret != -ENOENT) goto unlock; @@ -558,9 +666,23 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name, goto unlock; } + /* s64 count delta if we create or delete */ + if (tgs.totl) + tval.count = 
cpu_to_le64((u64)!!(value) - (u64)!!(ret != -ENOENT)); + /* found fields in key will also be used */ found_parts = ret >= 0 ? xattr_nr_parts(xat) : 0; + if (found_parts && tgs.totl) { + /* parse old totl value before we clobber xat buf */ + val_len = ret - offsetof(struct scoutfs_xattr, name[xat->name_len]); + ret = parse_totl_u64(&xat->name[xat->name_len], val_len, &total); + if (ret < 0) + goto unlock; + + le64_add_cpu(&tval.total, -total); + } + /* prepare our xattr */ if (value) { if (found_parts) @@ -572,6 +694,20 @@ static int scoutfs_xattr_set(struct dentry *dentry, const char *name, memset(xat->__pad, 0, sizeof(xat->__pad)); memcpy(xat->name, name, name_len); memcpy(&xat->name[xat->name_len], value, size); + + if (tgs.totl) { + ret = parse_totl_u64(value, size, &total); + if (ret < 0) + goto unlock; + /* total is only parsed, and so only valid, for totl xattrs */ + le64_add_cpu(&tval.total, total); + } + } + + if (tgs.totl) { + ret = scoutfs_lock_xattr_totl(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, &totl_lock); + if (ret) + goto unlock; } retry: @@ -597,6 +733,13 @@ retry: undo_srch = true; } + if (tgs.totl) { + ret = apply_totl_delta(sb, &totl_key, &tval, totl_lock); + if (ret < 0) + goto release; + undo_totl = true; + } + if (found_parts && value) ret = change_xattr_items(inode, id, xat, bytes, xattr_nr_parts(xat), found_parts, lck); @@ -620,12 +763,20 @@ release: err = scoutfs_forest_srch_add(sb, hash, ino, id); BUG_ON(err); } + if (ret < 0 && undo_totl) { + /* _delta() on dirty items shouldn't fail */ + tval.total = cpu_to_le64(-le64_to_cpu(tval.total)); + tval.count = cpu_to_le64(-le64_to_cpu(tval.count)); + err = apply_totl_delta(sb, &totl_key, &tval, totl_lock); + BUG_ON(err); + } scoutfs_release_trans(sb); scoutfs_inode_index_unlock(sb, &ind_locks); unlock: up_write(&si->xattr_rwsem); scoutfs_unlock(sb, lck, SCOUTFS_LOCK_WRITE); + scoutfs_unlock(sb, totl_lock, SCOUTFS_LOCK_WRITE_ONLY); out: vfree(xat); @@ -746,15 +897,22 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino, { struct scoutfs_xattr_prefix_tags tgs;
struct scoutfs_xattr *xat = NULL; + struct scoutfs_lock *totl_lock = NULL; + struct scoutfs_xattr_totl_val tval; + struct scoutfs_key totl_key; struct scoutfs_key last; struct scoutfs_key key; bool release = false; unsigned int bytes; + unsigned int val_len; + void *value; + u64 total; u64 hash; int ret; - /* need a buffer large enough for all possible names */ - bytes = sizeof(struct scoutfs_xattr) + SCOUTFS_XATTR_MAX_NAME_LEN; + /* need a buffer large enough for all possible names and totl value */ + bytes = sizeof(struct scoutfs_xattr) + SCOUTFS_XATTR_MAX_NAME_LEN + + SCOUTFS_XATTR_MAX_TOTL_U64; xat = kmalloc(bytes, GFP_NOFS); if (!xat) { ret = -ENOMEM; @@ -773,11 +931,37 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino, break; } + if (key.skx_part == 0 && (ret < sizeof(struct scoutfs_xattr) || + ret < offsetof(struct scoutfs_xattr, name[xat->name_len]))) { + ret = -EIO; + break; + } + if (key.skx_part != 0 || scoutfs_xattr_parse_tags(xat->name, xat->name_len, &tgs) != 0) memset(&tgs, 0, sizeof(tgs)); + if (tgs.totl) { + value = &xat->name[xat->name_len]; + val_len = ret - offsetof(struct scoutfs_xattr, name[xat->name_len]); + if (val_len != le16_to_cpu(xat->val_len)) { + ret = -EIO; + break; + } + + ret = parse_totl_key(&totl_key, xat->name, xat->name_len) ?: + parse_totl_u64(value, val_len, &total); + if (ret < 0) + break; + } + + if (tgs.totl && totl_lock == NULL) { + ret = scoutfs_lock_xattr_totl(sb, SCOUTFS_LOCK_WRITE_ONLY, 0, &totl_lock); + if (ret < 0) + break; + } + ret = scoutfs_hold_trans(sb, false); if (ret < 0) break; @@ -795,6 +979,14 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino, break; } + if (tgs.totl) { + tval.total = cpu_to_le64(-total); + tval.count = cpu_to_le64(-1LL); + ret = apply_totl_delta(sb, &totl_key, &tval, totl_lock); + if (ret < 0) + break; + } + scoutfs_release_trans(sb); release = false; @@ -803,6 +995,7 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino, if (release) scoutfs_release_trans(sb); +
scoutfs_unlock(sb, totl_lock, SCOUTFS_LOCK_WRITE_ONLY); kfree(xat); out: return ret; diff --git a/kmod/src/xattr.h b/kmod/src/xattr.h index 39313801..cbc6c599 100644 --- a/kmod/src/xattr.h +++ b/kmod/src/xattr.h @@ -16,10 +16,14 @@ int scoutfs_xattr_drop(struct super_block *sb, u64 ino, struct scoutfs_xattr_prefix_tags { unsigned long hide:1, - srch:1; + srch:1, + totl:1; }; int scoutfs_xattr_parse_tags(const char *name, unsigned int name_len, struct scoutfs_xattr_prefix_tags *tgs); +void scoutfs_xattr_init_totl_key(struct scoutfs_key *key, u64 *name); +int scoutfs_xattr_combine_totl(void *dst, int dst_len, void *src, int src_len); + #endif diff --git a/tests/golden/totl-xattr-tag b/tests/golden/totl-xattr-tag new file mode 100644 index 00000000..6ed98d2e --- /dev/null +++ b/tests/golden/totl-xattr-tag @@ -0,0 +1,30 @@ +== single file +1.2.3 = 1, 1 +4.5.6 = 1, 1 +== multiple files add up +1.2.3 = 2, 2 +4.5.6 = 2, 2 +== removing xattr updates total +1.2.3 = 2, 2 +4.5.6 = 1, 1 +== updating xattr updates total +1.2.3 = 11, 2 +4.5.6 = 1, 1 +== removing files update total +1.2.3 = 10, 1 +== multiple files/names in one transaction +1.2.3 = 55, 10 +== testing invalid names +setfattr: /mnt/test/test/totl-xattr-tag/invalid: Invalid argument +setfattr: /mnt/test/test/totl-xattr-tag/invalid: Invalid argument +setfattr: /mnt/test/test/totl-xattr-tag/invalid: Invalid argument +setfattr: /mnt/test/test/totl-xattr-tag/invalid: Invalid argument +setfattr: /mnt/test/test/totl-xattr-tag/invalid: Invalid argument +setfattr: /mnt/test/test/totl-xattr-tag/invalid: Invalid argument +== testing invalid values +setfattr: /mnt/test/test/totl-xattr-tag/invalid: Invalid argument +setfattr: /mnt/test/test/totl-xattr-tag/invalid: Invalid argument +setfattr: /mnt/test/test/totl-xattr-tag/invalid: Invalid argument +setfattr: /mnt/test/test/totl-xattr-tag/invalid: Invalid argument +setfattr: /mnt/test/test/totl-xattr-tag/invalid: Invalid argument +== larger population that could merge diff --git 
a/tests/sequence b/tests/sequence index b1ff9893..146fa047 100644 --- a/tests/sequence +++ b/tests/sequence @@ -10,6 +10,7 @@ move-blocks.sh enospc.sh srch-basic-functionality.sh simple-xattr-unit.sh +totl-xattr-tag.sh lock-refleak.sh lock-shrink-consistency.sh lock-pr-cw-conflict.sh diff --git a/tests/tests/totl-xattr-tag.sh b/tests/tests/totl-xattr-tag.sh new file mode 100644 index 00000000..dd2d90b7 --- /dev/null +++ b/tests/tests/totl-xattr-tag.sh @@ -0,0 +1,126 @@ +t_require_commands touch rm setfattr scoutfs find_xattrs + +read_xattr_totals() +{ + sync + scoutfs read-xattr-totals -p "$T_M0" +} + +echo "== single file" +touch "$T_D0/file-1" +setfattr -n scoutfs.totl.test.1.2.3 -v 1 "$T_D0/file-1" 2>&1 | t_filter_fs +setfattr -n scoutfs.totl.test.4.5.6 -v 1 "$T_D0/file-1" 2>&1 | t_filter_fs +read_xattr_totals + +echo "== multiple files add up" +touch "$T_D0/file-2" +setfattr -n scoutfs.totl.test.1.2.3 -v 1 "$T_D0/file-2" 2>&1 | t_filter_fs +setfattr -n scoutfs.totl.test.4.5.6 -v 1 "$T_D0/file-2" 2>&1 | t_filter_fs +read_xattr_totals + +echo "== removing xattr updates total" +setfattr -x scoutfs.totl.test.4.5.6 "$T_D0/file-2" 2>&1 | t_filter_fs +read_xattr_totals + +echo "== updating xattr updates total" +setfattr -n scoutfs.totl.test.1.2.3 -v 10 "$T_D0/file-2" 2>&1 | t_filter_fs +read_xattr_totals + +echo "== removing files update total" +rm -f "$T_D0/file-1" +read_xattr_totals +rm -f "$T_D0/file-2" +read_xattr_totals + +echo "== multiple files/names in one transaction" +for a in $(seq 1 10); do + touch "$T_D0/file-$a" + setfattr -n scoutfs.totl.test.1.2.3 -v $a "$T_D0/file-$a" 2>&1 | t_filter_fs +done +read_xattr_totals +rm -rf "$T_D0"/file-[0-9]* + +echo "== testing invalid names" +touch "$T_D0/invalid" +setfattr -n scoutfs.totl.test... 
-v 10 "$T_D0/invalid" 2>&1 | t_filter_fs +setfattr -n scoutfs.totl.test..2.3 -v 10 "$T_D0/invalid" 2>&1 | t_filter_fs +setfattr -n scoutfs.totl.test.1..3 -v 10 "$T_D0/invalid" 2>&1 | t_filter_fs +setfattr -n scoutfs.totl.test.1.2. -v 10 "$T_D0/invalid" 2>&1 | t_filter_fs +setfattr -n scoutfs.totl.test.1 -v 10 "$T_D0/invalid" 2>&1 | t_filter_fs +setfattr -n scoutfs.totl.test.1.2 -v 10 "$T_D0/invalid" 2>&1 | t_filter_fs + +echo "== testing invalid values" +setfattr -n scoutfs.totl.test.1.2.3 -v "+1" "$T_D0/invalid" 2>&1 | t_filter_fs +setfattr -n scoutfs.totl.test.1.2.3 -v "10." "$T_D0/invalid" 2>&1 | t_filter_fs +setfattr -n scoutfs.totl.test.1.2.3 -v "-" "$T_D0/invalid" 2>&1 | t_filter_fs +setfattr -n scoutfs.totl.test.1.2.3 -v "junk10" "$T_D0/invalid" 2>&1 | t_filter_fs +setfattr -n scoutfs.totl.test.1.2.3 -v "10junk" "$T_D0/invalid" 2>&1 | t_filter_fs +rm -f "$T_D0/invalid" + +echo "== larger population that could merge" +NR=5000 +TOTS=100 +CHECK=100 +PER_DIR=1000 +PER_FILE=10 + +declare -A totals counts +LOTS="$T_D0/lots" + +for i in $(seq 0 $PER_DIR $NR); do + p="$LOTS/$((i / PER_DIR))" + mkdir -p $p +done +for i in $(seq 0 $PER_FILE $NR); do + p="$LOTS/$((i / PER_DIR))/file-$((i / PER_FILE))" + touch $p +done + +for phase in create update remove; do + for i in $(seq 0 $NR); do + p="$LOTS/$((i / PER_DIR))/file-$((i / PER_FILE))" + + t=$((i % TOTS)) + n="scoutfs.totl.test-$i.$t.0.0" + + case $phase in + create) + v="$i" + setfattr -n "$n" -v "$v" "$p" 2>&1 >> $T_TMP.sfa + ((totals[$t]+=$v)) + ((counts[$t]++)) + ;; + update) + v=$((i * 3)) + delta=$((i * 2)) + setfattr -n "$n" -v "$v" "$p" 2>&1 >> $T_TMP.sfa + ((totals[$t]+=$delta)) + ;; + remove) + v=$((i * 3)) + setfattr -x "$n" "$p" 2>&1 >> $T_TMP.sfa + ((totals[$t]-=$v)) + ((counts[$t]--)) + ;; + esac + + if [ "$i" -gt 0 -a "$((i % CHECK))" == "0" ]; then + echo "checking $phase $i" > $T_TMP.check_arr + echo "checking $phase $i" > $T_TMP.check_read + + ( for k in ${!totals[@]}; do + echo "$k.0.0 = 
${totals[$k]}, ${counts[$k]}" + done ) | grep -v "= 0, 0$" | sort -n >> $T_TMP.check_arr + + sync + read_xattr_totals | sort -n >> $T_TMP.check_read + + diff -u $T_TMP.check_arr $T_TMP.check_read || \ + t_fail "totals read didn't match expected arrays" + fi + done +done + +rm -rf "$LOTS" + +t_pass diff --git a/utils/man/scoutfs.5 b/utils/man/scoutfs.5 index 150eb1fb..63962735 100644 --- a/utils/man/scoutfs.5 +++ b/utils/man/scoutfs.5 @@ -142,6 +142,62 @@ If the file is written to then the server cannot make forward progress and shuts down. The request can similarly enter an errored state if enough time passes before userspace completes the request. + +.SH EXTENDED ATTRIBUTE TAGS + +.B scoutfs +adds the +.IB scoutfs. +extended attribute namespace which uses a system of tags to extend the +functionality of extended attributes. Immediately following the +scoutfs. prefix are a series of tag words separated by dots. +Any text starting after the last recognized tag is considered the xattr +name and is not parsed. +.sp +Tags may be combined in any order. Specifying a tag more than once +will return an error. There is no explicit boundary between the end of +tags and the start of the name so unknown or incorrect tags will be +successfully parsed as part of the name of the xattr. Tags can only be +created, updated, or removed with the CAP_SYS_ADMIN capability. + +The following tags are currently supported: + +.RS +.TP +.B .hide. +Attributes with the .hide. tag are not visible to the +.BR listxattr(2) +system call. They will instead be included in the output of the +.IB LISTXATTR_HIDDEN +ioctl. This is meant to be used by archival management agents to store +metadata that is bound to a specific volume and should not be +transferred with the file by tools that read extended attributes, like +.BR tar(1) . +.TP +.B .srch. +Attributes with the .srch. tag are indexed so that they can be +found by the +.IB SEARCH_XATTRS +ioctl.
The search ioctl takes an extended attribute name and returns +the inode number of all the inodes which contain an extended attribute +with that name. The indexing structures behind .srch. tags are designed +to efficiently handle a large number of .srch. attributes per file with +no limits on the number of indexed files. +.TP +.B .totl. +Attributes with the .totl. tag are used to efficiently maintain counts +across all files in the system. The attribute's name must end in three +64bit values separated by dots that specify the global total that the +extended attribute will contribute to. The value of the extended +attribute is a string representation of the 64bit quantity which will be +added to the total. As attributes are added, updated, or removed (and +particularly as a file is finally deleted), the corresponding global +total is also updated by the file system. All the totals with their +name, total value, and a count of contributing attributes can be read +with the +.IB READ_XATTR_TOTALS +ioctl.
+.RE .SH CORRUPTION DETECTION A diff --git a/utils/src/print.c b/utils/src/print.c index efcb4f85..a7688cc4 100644 --- a/utils/src/print.c +++ b/utils/src/print.c @@ -75,6 +75,17 @@ static void print_orphan(struct scoutfs_key *key, void *val, int val_len) printf(" orphan: ino %llu\n", le64_to_cpu(key->sko_ino)); } + +static void print_xattr_totl(struct scoutfs_key *key, void *val, int val_len) +{ + struct scoutfs_xattr_totl_val *tval = val; + + printf(" xattr totl: %llu.%llu.%llu = %lld, %lld\n", + le64_to_cpu(key->skxt_a), le64_to_cpu(key->skxt_b), + le64_to_cpu(key->skxt_c), le64_to_cpu(tval->total), + le64_to_cpu(tval->count)); +} + static u8 *global_printable_name(u8 *name, int name_len) { static u8 name_buf[SCOUTFS_NAME_LEN + 1]; @@ -163,6 +174,9 @@ static print_func_t find_printer(u8 zone, u8 type) return print_orphan; } + if (zone == SCOUTFS_XATTR_TOTL_ZONE) + return print_xattr_totl; + if (zone == SCOUTFS_FS_ZONE) { switch(type) { case SCOUTFS_INODE_TYPE: return print_inode; diff --git a/utils/src/read_xattr_totals.c b/utils/src/read_xattr_totals.c new file mode 100644 index 00000000..d835e508 --- /dev/null +++ b/utils/src/read_xattr_totals.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sparse.h" +#include "parse.h" +#include "util.h" +#include "format.h" +#include "ioctl.h" +#include "cmd.h" + +struct xattr_args { + char *path; +}; + +static int do_read_xattr_totals(struct xattr_args *args) +{ + struct scoutfs_ioctl_read_xattr_totals rxt; + struct scoutfs_ioctl_xattr_total *xts = NULL; + struct scoutfs_ioctl_xattr_total *xt; + u64 bytes = 1024 * 1024; + int fd = -1; + int ret; + int i; + + fd = get_path(args->path, O_RDONLY); + if (fd < 0) + return fd; + + xts = malloc(bytes); + if (!xts) { + fprintf(stderr, "xattr total mem alloc failed\n"); + ret = -ENOMEM; + goto out; + } + + memset(&rxt, 0, sizeof(rxt)); + rxt.totals_ptr = (unsigned long)xts; + rxt.totals_bytes = 
bytes; + + for (;;) { + ret = ioctl(fd, SCOUTFS_IOC_READ_XATTR_TOTALS, &rxt); + if (ret == 0) + break; + if (ret < 0) { + ret = -errno; + fprintf(stderr, "read_xattr_totals ioctl failed: " + "%s (%d)\n", strerror(errno), errno); + goto out; + } + + for (i = 0, xt = xts; i < ret; i++, xt++) + printf("%llu.%llu.%llu = %lld, %lld\n", + xt->name[0], xt->name[1], xt->name[2], xt->total, xt->count); + + memcpy(&rxt.pos_name, &xts[ret - 1].name, sizeof(rxt.pos_name)); + if (++rxt.pos_name[2] == 0 && ++rxt.pos_name[1] == 0 && ++rxt.pos_name[0] == 0) + break; + } + + ret = 0; +out: + if (fd >= 0) + close(fd); + free(xts); + + return ret; +} + +static int parse_opt(int key, char *arg, struct argp_state *state) +{ + struct xattr_args *args = state->input; + + switch (key) { + case 'p': + args->path = strdup_or_error(state, arg); + break; + default: + break; + } + + return 0; +} + +static struct argp_option options[] = { + { "path", 'p', "PATH", 0, "Path to ScoutFS filesystem"}, + { NULL } +}; + +static struct argp argp = { + options, + parse_opt, + "", + "Print global value totals of .totl. xattrs" +}; + +static int read_xattr_totals_cmd(int argc, char **argv) +{ + + struct xattr_args xattr_args = {NULL}; + int ret; + + ret = argp_parse(&argp, argc, argv, 0, NULL, &xattr_args); + if (ret) + return ret; + + return do_read_xattr_totals(&xattr_args); +} + +static void __attribute__((constructor)) read_xattr_totals_ctor(void) +{ + cmd_register_argp("read-xattr-totals", &argp, GROUP_INFO, read_xattr_totals_cmd); +}