parquet_pushdown: distinguish "no score" from Score == 0 in ScoredRowRef

A bare float32 score conflates "this row has no score, it came from
a scalar predicate" with "this row's vector similarity score is
0.0", which is a legal value: zero L2 distance and zero dot product
both arise in real queries. Mixing scored and unscored results in
one response (e.g. hybrid scalar+vector) becomes ambiguous.

Add has_score (proto field 2, sandwiched between Ref and Score) and
the corresponding HasScore field in the design's Go sketch. Update
the prose paragraph to call out the new flag's purpose.
This commit is contained in:
Chris Lu
2026-04-25 15:09:15 -07:00
parent 6c826e710a
commit 8fe0b68b7c
3 changed files with 35 additions and 13 deletions

View File

@@ -690,18 +690,22 @@ type RowRef struct {
FilePosition int64 // 0-based row index within the file (file-absolute)
}
// ScoredRowRef pairs a row reference with its similarity score. Used
// for vector-search results so score order is unambiguous; non-vector
// queries leave Score zero.
// ScoredRowRef pairs a row reference with an optional similarity
// score. HasScore distinguishes "no score available" from "score is
// 0.0", which is a legal vector-similarity value: a perfect cosine
// match has Score == 1, but identical vectors have an L2 distance of
// 0.0 and orthogonal vectors have a dot product of 0.0. Non-vector
// queries leave HasScore false; vector queries always set it true.
type ScoredRowRef struct {
Ref RowRef
Score float32
Ref RowRef
HasScore bool
Score float32
}
```
Row identity uses **file-absolute** position (matching Iceberg position-delete files), not row-group-local. Row-group-local indexing is exposed via the convenience `RowGroup` field but is not authoritative — clients converting a `RowRef` back to a Parquet read should locate the row by `FilePosition` against the parsed footer's row-group boundaries.
`Scores` is no longer a parallel array. Pairing each score with its row ref via `ScoredRowRef` removes the ordering constraint and lets a single response mix scored (vector) and unscored (scalar) results without ambiguity.
`Scores` is no longer a parallel array. Pairing each score with its row ref via `ScoredRowRef` removes the ordering constraint, and the explicit `HasScore` flag lets a single response mix scored (vector) and unscored (scalar) results without aliasing valid zero-valued scores onto "no score".
## Connector Behavior

View File

@@ -220,7 +220,12 @@ message RowRef {
message ScoredRowRef {
RowRef ref = 1;
float score = 2;
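// has_score distinguishes "no score available" from "score is
// 0.0", which is a legal vector-similarity value (zero L2
// distance, zero dot product). Non-vector queries leave
// has_score false; vector queries always set it true.
// NOTE(review): has_score takes over field number 2, which used to
// carry the float score (now field 3), and the wire type changes
// from fixed32 to varint. Messages encoded with the old numbering
// are not wire-compatible with this layout. Confirm that no peer or
// stored data uses the old layout; otherwise keep score = 2 and give
// has_score a fresh field number.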
bool has_score = 2;
float score = 3;
}
message PushdownStats {

View File

@@ -1189,9 +1189,14 @@ func (x *RowRef) GetFilePosition() int64 {
}
type ScoredRowRef struct {
state protoimpl.MessageState `protogen:"open.v1"`
Ref *RowRef `protobuf:"bytes,1,opt,name=ref,proto3" json:"ref,omitempty"`
Score float32 `protobuf:"fixed32,2,opt,name=score,proto3" json:"score,omitempty"`
state protoimpl.MessageState `protogen:"open.v1"`
Ref *RowRef `protobuf:"bytes,1,opt,name=ref,proto3" json:"ref,omitempty"`
// has_score distinguishes "no score available" from "score is
// 0.0", which is a legal vector-similarity value (zero L2
// distance, zero dot product). Non-vector queries leave
// has_score false; vector queries always set it true.
HasScore bool `protobuf:"varint,2,opt,name=has_score,json=hasScore,proto3" json:"has_score,omitempty"`
Score float32 `protobuf:"fixed32,3,opt,name=score,proto3" json:"score,omitempty"`
unknownFields protoimpl.UnknownFields
sizeCache protoimpl.SizeCache
}
@@ -1233,6 +1238,13 @@ func (x *ScoredRowRef) GetRef() *RowRef {
return nil
}
// GetHasScore reports whether this row carries a similarity score.
// It is safe to call on a nil *ScoredRowRef, in which case it
// returns false.
func (x *ScoredRowRef) GetHasScore() bool {
	if x == nil {
		return false
	}
	return x.HasScore
}
func (x *ScoredRowRef) GetScore() float32 {
if x != nil {
return x.Score
@@ -1444,10 +1456,11 @@ const file_parquet_pushdown_proto_rawDesc = "" +
"\x06RowRef\x12\x12\n" +
"\x04file\x18\x01 \x01(\tR\x04file\x12\x1b\n" +
"\trow_group\x18\x02 \x01(\x05R\browGroup\x12#\n" +
"\rfile_position\x18\x03 \x01(\x03R\ffilePosition\"S\n" +
"\rfile_position\x18\x03 \x01(\x03R\ffilePosition\"p\n" +
"\fScoredRowRef\x12-\n" +
"\x03ref\x18\x01 \x01(\v2\x1b.parquet_pushdown_pb.RowRefR\x03ref\x12\x14\n" +
"\x05score\x18\x02 \x01(\x02R\x05score\"\x81\x03\n" +
"\x03ref\x18\x01 \x01(\v2\x1b.parquet_pushdown_pb.RowRefR\x03ref\x12\x1b\n" +
"\thas_score\x18\x02 \x01(\bR\bhasScore\x12\x14\n" +
"\x05score\x18\x03 \x01(\x02R\x05score\"\x81\x03\n" +
"\rPushdownStats\x12\x1d\n" +
"\n" +
"trust_mode\x18\x01 \x01(\tR\ttrustMode\x12,\n" +