From 8fe0b68b7c5fdcc4b1ee7d63b0cbca7a529261ff Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Sat, 25 Apr 2026 15:09:15 -0700 Subject: [PATCH] parquet_pushdown: distinguish "no score" from Score == 0 in ScoredRowRef A bare float32 score conflates "this row has no score, it came from a scalar predicate" with "this row's vector similarity score is 0.0", which is a legal value: zero L2 distance and zero dot product both arise in real queries. Mixing scored and unscored results in one response (e.g. hybrid scalar+vector) becomes ambiguous. Add has_score as proto field 2, between ref and score, and move score from field 2 to field 3. Because field 2 changes from a fixed32 float to a varint bool, the new message is not wire-compatible with peers built from the previous schema. Regenerate the Go bindings and add the corresponding HasScore field in the design's Go sketch. Update the prose paragraph to call out the new flag's purpose. --- PARQUET_PUSHDOWN_DESIGN.md | 16 +++++++----- weed/pb/parquet_pushdown.proto | 7 +++++- .../parquet_pushdown.pb.go | 25 ++++++++++++++----- 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/PARQUET_PUSHDOWN_DESIGN.md b/PARQUET_PUSHDOWN_DESIGN.md index f1999cf9a..7dec3e7bb 100644 --- a/PARQUET_PUSHDOWN_DESIGN.md +++ b/PARQUET_PUSHDOWN_DESIGN.md @@ -690,18 +690,22 @@ type RowRef struct { FilePosition int64 // 0-based row index within the file (file-absolute) } -// ScoredRowRef pairs a row reference with its similarity score. Used -// for vector-search results so score order is unambiguous; non-vector -// queries leave Score zero. +// ScoredRowRef pairs a row reference with an optional similarity +// score. HasScore distinguishes "no score available" from "score is +// 0.0", which is a legal vector-similarity value (a perfect cosine +// match has Score == 1, but L2 distance and dot products commonly +// produce 0.0 for nontrivial inputs). Non-vector queries leave +// HasScore false; vector queries always set it true. type ScoredRowRef struct { - Ref RowRef - Score float32 + Ref RowRef + HasScore bool + Score float32 } ``` Row identity uses **file-absolute** position (matching Iceberg position-delete files), not row-group-local. 
Row-group-local indexing is exposed via the convenience `RowGroup` field but is not authoritative — clients converting a `RowRef` back to a Parquet read should locate the row by `FilePosition` against the parsed footer's row-group boundaries. -`Scores` is no longer a parallel array. Pairing each score with its row ref via `ScoredRowRef` removes the ordering constraint and lets a single response mix scored (vector) and unscored (scalar) results without ambiguity. +`Scores` is no longer a parallel array. Pairing each score with its row ref via `ScoredRowRef` removes the ordering constraint, and the explicit `HasScore` flag lets a single response mix scored (vector) and unscored (scalar) results without aliasing valid zero-valued scores onto "no score". ## Connector Behavior diff --git a/weed/pb/parquet_pushdown.proto b/weed/pb/parquet_pushdown.proto index 4a847fa75..d22d1b6a0 100644 --- a/weed/pb/parquet_pushdown.proto +++ b/weed/pb/parquet_pushdown.proto @@ -220,7 +220,12 @@ message RowRef { message ScoredRowRef { RowRef ref = 1; - float score = 2; + // has_score distinguishes "no score available" from "score is + // 0.0", which is a legal vector-similarity value (zero L2 + // distance, zero dot product). Non-vector queries leave + // has_score false; vector queries always set it true. 
+ bool has_score = 2; + float score = 3; } message PushdownStats { diff --git a/weed/pb/parquet_pushdown_pb/parquet_pushdown.pb.go b/weed/pb/parquet_pushdown_pb/parquet_pushdown.pb.go index 7cda3d0f0..8dc92ff23 100644 --- a/weed/pb/parquet_pushdown_pb/parquet_pushdown.pb.go +++ b/weed/pb/parquet_pushdown_pb/parquet_pushdown.pb.go @@ -1189,9 +1189,14 @@ func (x *RowRef) GetFilePosition() int64 { } type ScoredRowRef struct { - state protoimpl.MessageState `protogen:"open.v1"` - Ref *RowRef `protobuf:"bytes,1,opt,name=ref,proto3" json:"ref,omitempty"` - Score float32 `protobuf:"fixed32,2,opt,name=score,proto3" json:"score,omitempty"` + state protoimpl.MessageState `protogen:"open.v1"` + Ref *RowRef `protobuf:"bytes,1,opt,name=ref,proto3" json:"ref,omitempty"` + // has_score distinguishes "no score available" from "score is + // 0.0", which is a legal vector-similarity value (zero L2 + // distance, zero dot product). Non-vector queries leave + // has_score false; vector queries always set it true. 
+ HasScore bool `protobuf:"varint,2,opt,name=has_score,json=hasScore,proto3" json:"has_score,omitempty"` + Score float32 `protobuf:"fixed32,3,opt,name=score,proto3" json:"score,omitempty"` unknownFields protoimpl.UnknownFields sizeCache protoimpl.SizeCache } @@ -1233,6 +1238,13 @@ func (x *ScoredRowRef) GetRef() *RowRef { return nil } +func (x *ScoredRowRef) GetHasScore() bool { + if x != nil { + return x.HasScore + } + return false +} + func (x *ScoredRowRef) GetScore() float32 { if x != nil { return x.Score @@ -1444,10 +1456,11 @@ const file_parquet_pushdown_proto_rawDesc = "" + "\x06RowRef\x12\x12\n" + "\x04file\x18\x01 \x01(\tR\x04file\x12\x1b\n" + "\trow_group\x18\x02 \x01(\x05R\browGroup\x12#\n" + - "\rfile_position\x18\x03 \x01(\x03R\ffilePosition\"S\n" + + "\rfile_position\x18\x03 \x01(\x03R\ffilePosition\"p\n" + "\fScoredRowRef\x12-\n" + - "\x03ref\x18\x01 \x01(\v2\x1b.parquet_pushdown_pb.RowRefR\x03ref\x12\x14\n" + - "\x05score\x18\x02 \x01(\x02R\x05score\"\x81\x03\n" + + "\x03ref\x18\x01 \x01(\v2\x1b.parquet_pushdown_pb.RowRefR\x03ref\x12\x1b\n" + + "\thas_score\x18\x02 \x01(\bR\bhasScore\x12\x14\n" + + "\x05score\x18\x03 \x01(\x02R\x05score\"\x81\x03\n" + "\rPushdownStats\x12\x1d\n" + "\n" + "trust_mode\x18\x01 \x01(\tR\ttrustMode\x12,\n" +