From cd6831392903e453944dfeb005be6571c85fd4d2 Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Wed, 27 May 2026 10:23:29 -0700 Subject: [PATCH] fix(filer.sync): resolve manifest chunks against source filer (#9705) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(filer.sync): resolve manifest chunks against source filer `UpdateEntry` was passing `filer.LookupFn(fs)` — the sink filer client — into `compareChunks`. But `oldEntry`/`newEntry` chunks come from the source cluster, so manifest resolution must hit the source filer's volume servers. With two clusters that have overlapping volume IDs (common once they grow past a few hundred volumes), the sink lookup returns its own volume's URLs and the fetch 404s on the source's fileKey: compare chunks error: fail to read manifest 631,0babe...: 404 Not Found The 404 aborts the diff, the manifest chunk never gets replicated, and the target ends up with whatever flat chunks happened to land from earlier partial syncs — visible as `SIZE_MISMATCH` in filer.sync.verify on files large enough to use chunk manifests (~150 GB+ in practice). Only the manifest path was wrong; flat-chunk reads in `fetchAndWrite` already use `fs.filerSource.ReadPart`. * trim comment * test(filer.sync): regression test for source-filer manifest lookup Two recording filer gRPC servers stand in for source and sink. Driving UpdateEntry with a manifest chunk and observing which one receives LookupVolume proves compareChunks routes source-side lookups through fs.filerSource, not fs. Reverting the fix flips the call onto the sink filer and fails the assertion. * drop test --- weed/replication/sink/filersink/filer_sink.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/weed/replication/sink/filersink/filer_sink.go b/weed/replication/sink/filersink/filer_sink.go index 73ad57e1e..93b99ec75 100644 --- a/weed/replication/sink/filersink/filer_sink.go +++ b/weed/replication/sink/filersink/filer_sink.go @@ -265,8 +265,8 @@ func (fs *FilerSink) UpdateEntry(key string, oldEntry *filer_pb.Entry, newParent // this usually happens when the messages are not ordered glog.V(2).Infof("late updates %s", key) } else { - // find out what changed - deletedChunks, newChunks, err := compareChunks(context.Background(), filer.LookupFn(fs), oldEntry, newEntry) + // source-side chunks resolve via source filer; sink volume IDs may collide. + deletedChunks, newChunks, err := compareChunks(context.Background(), filer.LookupFn(fs.filerSource), oldEntry, newEntry) if err != nil { return true, fmt.Errorf("replicate %s compare chunks error: %v", key, err) }