Don't emit empty blocks in kway_merge()

It's possible for a srch compaction to collapse down to nothing
if given evenly paired create/delete entries. In this case, we
were emitting an empty block. This could cause problems for
search_sorted_file(), which assumes that every block it sees
has a valid first and last entry.

Fix this by keeping a temp entry and only emitting it if it differs
from the next entry in the block. Be sure to flush out a straggling
temp entry if we have one when we're done with the last block of
the merge.
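
As a rough illustration of the approach (a user-space sketch, not the scoutfs
code itself: struct entry, entry_cmp() and write_entry() are stand-ins for
scoutfs_srch_entry, sre_cmp() and the block encoder, and this sketch cancels
duplicates across the whole stream rather than only within a block):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct entry {
	uint64_t hash;
	uint64_t ino;
	uint64_t id;
};

static int entry_cmp(const struct entry *a, const struct entry *b)
{
	return memcmp(a, b, sizeof(*a));	/* placeholder ordering */
}

static void write_entry(const struct entry *ent)
{
	printf("emit hash 0x%llx ino %llu id 0x%llx\n",
	       (unsigned long long)ent->hash,
	       (unsigned long long)ent->ino,
	       (unsigned long long)ent->id);
}

/*
 * Emit each distinct entry from a sorted stream.  A matching pair
 * (create + delete) cancels out: the cached tmp entry is dropped and
 * nothing is written, so a merge that cancels completely never writes
 * an entry and never needs an output block at all.
 */
static void merge_emit(const struct entry *sorted, size_t nr)
{
	struct entry tmp = {0};
	bool have_tmp = false;
	size_t i;

	for (i = 0; i < nr; i++) {
		if (have_tmp && entry_cmp(&sorted[i], &tmp) == 0) {
			/* duplicate means deletion: drop the cached entry */
			have_tmp = false;
			continue;
		}
		if (have_tmp)
			write_entry(&tmp);
		tmp = sorted[i];
		have_tmp = true;
	}

	/* flush a straggling tmp entry left over after the last input */
	if (have_tmp)
		write_entry(&tmp);
}

int main(void)
{
	/* the create/delete pair for ino 2 cancels; only ino 1 and 3 emit */
	struct entry in[] = {
		{ .hash = 0x10, .ino = 1, .id = 1 },
		{ .hash = 0x20, .ino = 2, .id = 2 },
		{ .hash = 0x20, .ino = 2, .id = 2 },
		{ .hash = 0x30, .ino = 3, .id = 3 },
	};

	merge_emit(in, sizeof(in) / sizeof(in[0]));
	return 0;
}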

Signed-off-by: Chris Kirby <ckirby@versity.com>
Chris Kirby
2026-01-14 15:03:02 -06:00
parent c9575cda2e
commit 2bebab4873
2 changed files with 241 additions and 82 deletions


@@ -2660,6 +2660,110 @@ TRACE_EVENT(scoutfs_get_file_block,
__entry->last_hash, __entry->last_ino, __entry->last_id)
);
TRACE_EVENT(scoutfs_srch_new_merge,
	TP_PROTO(struct super_block *sb),
	TP_ARGS(sb),
	TP_STRUCT__entry(
		SCSB_TRACE_FIELDS
	),
	TP_fast_assign(
		SCSB_TRACE_ASSIGN(sb);
	),
	TP_printk(SCSBF, SCSB_TRACE_ARGS)
);
TRACE_EVENT(scoutfs_srch_emit_entry,
	TP_PROTO(struct super_block *sb, struct scoutfs_srch_entry *sre,
		 struct scoutfs_srch_block *srb, u64 blkno),
	TP_ARGS(sb, sre, srb, blkno),
	TP_STRUCT__entry(
		SCSB_TRACE_FIELDS
		__field(__u32, entry_nr)
		__field(__u64, blkno)
		__field(__u64, hash)
		__field(__u64, ino)
		__field(__u64, id)
	),
	TP_fast_assign(
		SCSB_TRACE_ASSIGN(sb);
		__entry->entry_nr = __le32_to_cpu(srb->entry_nr);
		__entry->blkno = blkno;
		__entry->hash = __le64_to_cpu(sre->hash);
		__entry->ino = __le64_to_cpu(sre->ino);
		__entry->id = __le64_to_cpu(sre->id);
	),
	TP_printk(SCSBF" nr %u blkno %llu hash 0x%llx ino %llu id 0x%llx",
		  SCSB_TRACE_ARGS, __entry->entry_nr, __entry->blkno,
		  __entry->hash, __entry->ino, __entry->id)
);
TRACE_EVENT(scoutfs_srch_clr_tmp,
	TP_PROTO(struct super_block *sb, struct scoutfs_srch_entry *tmp),
	TP_ARGS(sb, tmp),
	TP_STRUCT__entry(
		SCSB_TRACE_FIELDS
		__field(__u64, tmp_hash)
		__field(__u64, tmp_ino)
		__field(__u64, tmp_id)
	),
	TP_fast_assign(
		SCSB_TRACE_ASSIGN(sb);
		__entry->tmp_hash = __le64_to_cpu(tmp->hash);
		__entry->tmp_ino = __le64_to_cpu(tmp->ino);
		__entry->tmp_id = __le64_to_cpu(tmp->id);
	),
	TP_printk(SCSBF" tmp hash 0x%llx tmp ino %llu tmp id 0x%llx",
		  SCSB_TRACE_ARGS,
		  __entry->tmp_hash, __entry->tmp_ino, __entry->tmp_id)
);
TRACE_EVENT(scoutfs_srch_cmp,
	TP_PROTO(struct super_block *sb, struct scoutfs_srch_entry *root,
		 struct scoutfs_srch_entry *tmp, void *bl),
	TP_ARGS(sb, root, tmp, bl),
	TP_STRUCT__entry(
		SCSB_TRACE_FIELDS
		__field(__u64, root_hash)
		__field(__u64, root_ino)
		__field(__u64, root_id)
		__field(__u64, tmp_hash)
		__field(__u64, tmp_ino)
		__field(__u64, tmp_id)
		__field(void *, bl)
	),
	TP_fast_assign(
		SCSB_TRACE_ASSIGN(sb);
		__entry->root_hash = __le64_to_cpu(root->hash);
		__entry->root_ino = __le64_to_cpu(root->ino);
		__entry->root_id = __le64_to_cpu(root->id);
		__entry->tmp_hash = __le64_to_cpu(tmp->hash);
		__entry->tmp_ino = __le64_to_cpu(tmp->ino);
		__entry->tmp_id = __le64_to_cpu(tmp->id);
		__entry->bl = bl;
	),
	TP_printk(SCSBF" root hash 0x%llx root ino %llu root id 0x%llx tmp hash 0x%llx tmp ino %llu tmp id 0x%llx, bl %p",
		  SCSB_TRACE_ARGS,
		  __entry->root_hash, __entry->root_ino, __entry->root_id,
		  __entry->tmp_hash, __entry->tmp_ino, __entry->tmp_id,
		  __entry->bl)
);
TRACE_EVENT(scoutfs_block_stale,
TP_PROTO(struct super_block *sb, struct scoutfs_block_ref *ref,
struct scoutfs_block_header *hdr, u32 magic, u32 crc),


@@ -1525,6 +1525,66 @@ static bool should_commit(struct super_block *sb, struct scoutfs_alloc *alloc,
scoutfs_alloc_meta_low(sb, alloc, nr);
}
static int alloc_srch_block(struct super_block *sb, struct scoutfs_alloc *alloc,
			    struct scoutfs_block_writer *wri,
			    struct scoutfs_srch_file *sfl,
			    struct scoutfs_block **bl,
			    u64 blk)
{
	DECLARE_SRCH_INFO(sb, srinf);
	int ret;

	if (atomic_read(&srinf->shutdown))
		return -ESHUTDOWN;

	/* could grow and dirty to a leaf */
	if (should_commit(sb, alloc, wri, sfl->height + 1))
		return -EAGAIN;

	ret = get_file_block(sb, alloc, wri, sfl, GFB_INSERT | GFB_DIRTY,
			     blk, bl);
	if (ret < 0)
		return ret;

	scoutfs_inc_counter(sb, srch_compact_dirty_block);
	return 0;
}
static int emit_srch_entry(struct super_block *sb,
			   struct scoutfs_srch_file *sfl,
			   struct scoutfs_srch_block *srb,
			   struct scoutfs_srch_entry *sre,
			   u64 blk)
{
	int ret;

	ret = encode_entry(srb->entries + le32_to_cpu(srb->entry_bytes),
			   sre, &srb->tail);
	if (WARN_ON_ONCE(ret <= 0)) {
		/* shouldn't happen */
		return -EIO;
	}

	if (srb->entry_bytes == 0) {
		if (blk == 0)
			sfl->first = *sre;
		srb->first = *sre;
	}

	le32_add_cpu(&srb->entry_nr, 1);
	le32_add_cpu(&srb->entry_bytes, ret);
	srb->last = *sre;
	srb->tail = *sre;
	sfl->last = *sre;
	le64_add_cpu(&sfl->entries, 1);

	scoutfs_inc_counter(sb, srch_compact_entry);
	trace_scoutfs_srch_emit_entry(sb, sre, srb, blk);

	return 0;
}
struct tourn_node {
struct scoutfs_srch_entry sre;
int ind;
@@ -1559,20 +1619,18 @@ static int kway_merge(struct super_block *sb,
kway_get_t kway_get, kway_advance_t kway_adv,
void **args, int nr, bool logs_input)
{
DECLARE_SRCH_INFO(sb, srinf);
struct scoutfs_srch_block *srb = NULL;
struct scoutfs_srch_entry last_tail;
struct scoutfs_srch_entry tmp_entry = {0};
struct scoutfs_block *bl = NULL;
struct tourn_node *tnodes;
struct tourn_node *leaves;
struct tourn_node *root;
struct tourn_node *tn;
int last_bytes = 0;
bool have_tmp = false;
int nr_parents;
int nr_nodes;
int empty = 0;
int ret = 0;
int diff;
u64 blk;
int ind;
int i;
@@ -1606,97 +1664,73 @@ static int kway_merge(struct super_block *sb,
}
}
trace_scoutfs_srch_new_merge(sb);
/* always append new blocks */
blk = le64_to_cpu(sfl->blocks);
while (empty < nr) {
if (bl == NULL) {
if (atomic_read(&srinf->shutdown)) {
ret = -ESHUTDOWN;
goto out;
trace_scoutfs_srch_cmp(sb, &root->sre, &tmp_entry, bl);
if (sre_cmp(&root->sre, &tmp_entry) != 0) {
if (have_tmp) {
if (bl == NULL) {
ret = alloc_srch_block(sb, alloc, wri,
sfl, &bl, blk);
if (ret < 0) {
if (ret == -EAGAIN)
ret = 0;
goto out;
}
srb = bl->data;
}
ret = emit_srch_entry(sb, sfl, srb, &tmp_entry,
blk);
if (ret < 0)
goto out;
if (le32_to_cpu(srb->entry_bytes) >
SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
scoutfs_block_put(sb, bl);
bl = NULL;
blk++;
memset(&tmp_entry, 0, sizeof(tmp_entry));
have_tmp = false;
continue;
}
/*
* end sorted block on _SAFE offset for
* testing
*/
if (bl && le32_to_cpu(srb->entry_nr) == 1 &&
logs_input &&
scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
pad_entries_at_safe(sfl, srb);
scoutfs_block_put(sb, bl);
bl = NULL;
blk++;
memset(&tmp_entry, 0, sizeof(tmp_entry));
have_tmp = false;
continue;
}
}
/* could grow and dirty to a leaf */
if (should_commit(sb, alloc, wri, sfl->height + 1)) {
ret = 0;
goto out;
}
ret = get_file_block(sb, alloc, wri, sfl,
GFB_INSERT | GFB_DIRTY, blk, &bl);
if (ret < 0)
goto out;
srb = bl->data;
scoutfs_inc_counter(sb, srch_compact_dirty_block);
}
if (sre_cmp(&root->sre, &srb->last) != 0) {
last_bytes = le32_to_cpu(srb->entry_bytes);
last_tail = srb->last;
ret = encode_entry(srb->entries +
le32_to_cpu(srb->entry_bytes),
&root->sre, &srb->tail);
if (WARN_ON_ONCE(ret <= 0)) {
/* shouldn't happen */
ret = -EIO;
goto out;
}
if (srb->entry_bytes == 0) {
if (blk == 0)
sfl->first = root->sre;
srb->first = root->sre;
}
le32_add_cpu(&srb->entry_nr, 1);
le32_add_cpu(&srb->entry_bytes, ret);
srb->last = root->sre;
srb->tail = root->sre;
sfl->last = root->sre;
le64_add_cpu(&sfl->entries, 1);
ret = 0;
if (le32_to_cpu(srb->entry_bytes) >
SCOUTFS_SRCH_BLOCK_SAFE_BYTES) {
scoutfs_block_put(sb, bl);
bl = NULL;
blk++;
}
/* end sorted block on _SAFE offset for testing */
if (bl && le32_to_cpu(srb->entry_nr) == 1 && logs_input &&
scoutfs_trigger(sb, SRCH_COMPACT_LOGS_PAD_SAFE)) {
pad_entries_at_safe(sfl, srb);
scoutfs_block_put(sb, bl);
bl = NULL;
blk++;
}
scoutfs_inc_counter(sb, srch_compact_entry);
tmp_entry = root->sre;
have_tmp = true;
} else {
/*
* Duplicate entries indicate deletion so we
* undo the previously encoded entry and ignore
* undo the previously cached tmp entry and ignore
* this entry. This only happens within each
* block. Deletions can span block boundaries
* and will be filtered out by search and
* hopefully removed in future compactions.
*/
diff = le32_to_cpu(srb->entry_bytes) - last_bytes;
if (diff) {
memset(srb->entries + last_bytes, 0, diff);
if (srb->entry_bytes == 0) {
/* last_tail will be 0 */
if (blk == 0)
sfl->first = last_tail;
srb->first = last_tail;
}
le32_add_cpu(&srb->entry_nr, -1);
srb->entry_bytes = cpu_to_le32(last_bytes);
srb->last = last_tail;
srb->tail = last_tail;
sfl->last = last_tail;
le64_add_cpu(&sfl->entries, -1);
}
trace_scoutfs_srch_clr_tmp(sb, &tmp_entry);
memset(&tmp_entry, 0, sizeof(tmp_entry));
have_tmp = false;
scoutfs_inc_counter(sb, srch_compact_removed_entry);
}
@@ -1739,6 +1773,22 @@ static int kway_merge(struct super_block *sb,
/* could stream a final index.. arguably a small portion of work */
out:
	if (have_tmp) {
		bool emit = true;

		if (bl == NULL) {
			ret = alloc_srch_block(sb, alloc, wri, sfl, &bl, blk);
			if (ret) {
				emit = false;
				if (ret == -EAGAIN)
					ret = 0;
			} else {
				srb = bl->data;
			}
		}

		if (emit)
			ret = emit_srch_entry(sb, sfl, srb, &tmp_entry, blk);
	}
scoutfs_block_put(sb, bl);
vfree(tnodes);
return ret;
@@ -1982,6 +2032,11 @@ static int kway_get_reader(struct super_block *sb,
rdr->skip > SCOUTFS_SRCH_BLOCK_SAFE_BYTES ||
rdr->skip >= le32_to_cpu(srb->entry_bytes)) {
/* XXX inconsistency */
scoutfs_err(sb, "blkno %llu pos %u vs %ld, skip %u, bytes %u",
__le64_to_cpu(srb->hdr.blkno),
rdr->pos, SCOUTFS_SRCH_BLOCK_SAFE_BYTES,
rdr->skip,
le32_to_cpu(srb->entry_bytes));
return -EIO;
}