mirror of
https://github.com/seaweedfs/seaweedfs.git
synced 2026-05-25 11:10:20 +00:00
parquet_pushdown: distinguish "no score" from Score == 0 in ScoredRowRef
A bare float32 score conflates "this row has no score, it came from a scalar predicate" with "this row's vector similarity score is 0.0", which is a legal value: zero L2 distance and zero dot product both arise in real queries. Mixing scored and unscored results in one response (e.g. hybrid scalar+vector) therefore becomes ambiguous. Add has_score (proto field 2, sandwiched between Ref and Score) and the corresponding HasScore field in the design's Go sketch. Note that this moves score from proto field 2 to field 3, so the new schema is not wire-compatible with messages serialized under the previous numbering. Update the prose paragraph to call out the new flag's purpose.
This commit is contained in:
@@ -690,18 +690,22 @@ type RowRef struct {
|
||||
FilePosition int64 // 0-based row index within the file (file-absolute)
|
||||
}
|
||||
|
||||
// ScoredRowRef pairs a row reference with its similarity score. Used
|
||||
// for vector-search results so score order is unambiguous; non-vector
|
||||
// queries leave Score zero.
|
||||
// ScoredRowRef pairs a row reference with an optional similarity
|
||||
// score. HasScore distinguishes "no score available" from "score is
|
||||
// 0.0", which is a legal vector-similarity value (a perfect cosine
|
||||
// match has Score == 1, but identical vectors have an L2 distance of
|
||||
// 0.0 and orthogonal vectors have a dot product of 0.0). Non-vector
|
||||
// queries leave HasScore false; vector queries always set it true.
|
||||
type ScoredRowRef struct {
|
||||
Ref RowRef
|
||||
Score float32
|
||||
Ref RowRef
|
||||
HasScore bool
|
||||
Score float32
|
||||
}
|
||||
```
|
||||
|
||||
Row identity uses **file-absolute** position (matching Iceberg position-delete files), not row-group-local. Row-group-local indexing is exposed via the convenience `RowGroup` field but is not authoritative — clients converting a `RowRef` back to a Parquet read should locate the row by `FilePosition` against the parsed footer's row-group boundaries.
|
||||
|
||||
`Scores` is no longer a parallel array. Pairing each score with its row ref via `ScoredRowRef` removes the ordering constraint and lets a single response mix scored (vector) and unscored (scalar) results without ambiguity.
|
||||
`Scores` is no longer a parallel array. Pairing each score with its row ref via `ScoredRowRef` removes the ordering constraint, and the explicit `HasScore` flag lets a single response mix scored (vector) and unscored (scalar) results without aliasing valid zero-valued scores onto "no score".
|
||||
|
||||
## Connector Behavior
|
||||
|
||||
|
||||
@@ -220,7 +220,12 @@ message RowRef {
|
||||
|
||||
message ScoredRowRef {
|
||||
RowRef ref = 1;
|
||||
float score = 2;
|
||||
// has_score distinguishes "no score available" from "score is
|
||||
// 0.0", which is a legal vector-similarity value (zero L2
|
||||
// distance, zero dot product). Non-vector queries leave
|
||||
// has_score false; vector queries always set it true.
|
||||
bool has_score = 2;
|
||||
float score = 3;
|
||||
}
|
||||
|
||||
message PushdownStats {
|
||||
|
||||
@@ -1189,9 +1189,14 @@ func (x *RowRef) GetFilePosition() int64 {
|
||||
}
|
||||
|
||||
type ScoredRowRef struct {
|
||||
state protoimpl.MessageState `protogen:"open.v1"`
|
||||
Ref *RowRef `protobuf:"bytes,1,opt,name=ref,proto3" json:"ref,omitempty"`
|
||||
Score float32 `protobuf:"fixed32,2,opt,name=score,proto3" json:"score,omitempty"`
|
||||
state protoimpl.MessageState `protogen:"open.v1"`
|
||||
Ref *RowRef `protobuf:"bytes,1,opt,name=ref,proto3" json:"ref,omitempty"`
|
||||
// has_score distinguishes "no score available" from "score is
|
||||
// 0.0", which is a legal vector-similarity value (zero L2
|
||||
// distance, zero dot product). Non-vector queries leave
|
||||
// has_score false; vector queries always set it true.
|
||||
HasScore bool `protobuf:"varint,2,opt,name=has_score,json=hasScore,proto3" json:"has_score,omitempty"`
|
||||
Score float32 `protobuf:"fixed32,3,opt,name=score,proto3" json:"score,omitempty"`
|
||||
unknownFields protoimpl.UnknownFields
|
||||
sizeCache protoimpl.SizeCache
|
||||
}
|
||||
@@ -1233,6 +1238,13 @@ func (x *ScoredRowRef) GetRef() *RowRef {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (x *ScoredRowRef) GetHasScore() bool {
|
||||
if x != nil {
|
||||
return x.HasScore
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func (x *ScoredRowRef) GetScore() float32 {
|
||||
if x != nil {
|
||||
return x.Score
|
||||
@@ -1444,10 +1456,11 @@ const file_parquet_pushdown_proto_rawDesc = "" +
|
||||
"\x06RowRef\x12\x12\n" +
|
||||
"\x04file\x18\x01 \x01(\tR\x04file\x12\x1b\n" +
|
||||
"\trow_group\x18\x02 \x01(\x05R\browGroup\x12#\n" +
|
||||
"\rfile_position\x18\x03 \x01(\x03R\ffilePosition\"S\n" +
|
||||
"\rfile_position\x18\x03 \x01(\x03R\ffilePosition\"p\n" +
|
||||
"\fScoredRowRef\x12-\n" +
|
||||
"\x03ref\x18\x01 \x01(\v2\x1b.parquet_pushdown_pb.RowRefR\x03ref\x12\x14\n" +
|
||||
"\x05score\x18\x02 \x01(\x02R\x05score\"\x81\x03\n" +
|
||||
"\x03ref\x18\x01 \x01(\v2\x1b.parquet_pushdown_pb.RowRefR\x03ref\x12\x1b\n" +
|
||||
"\thas_score\x18\x02 \x01(\bR\bhasScore\x12\x14\n" +
|
||||
"\x05score\x18\x03 \x01(\x02R\x05score\"\x81\x03\n" +
|
||||
"\rPushdownStats\x12\x1d\n" +
|
||||
"\n" +
|
||||
"trust_mode\x18\x01 \x01(\tR\ttrustMode\x12,\n" +
|
||||
|
||||
Reference in New Issue
Block a user