test/vector_search: migrate non-ANN similarity argument rescoring test to pytest

Migrate select_similarity_function_other_than_ann_ordering from
rescoring_test.cc to pytest. The test verifies that similarity scores in
SELECT are computed against the explicitly supplied argument vector rather
than the ANN ordering vector. No semantic change.
This commit is contained in:
Szymon Malewski
2026-05-14 11:29:23 +02:00
parent 0cb557695a
commit 63d9b7445f
2 changed files with 24 additions and 39 deletions

View File

@@ -212,3 +212,27 @@ def test_wildcard_select_is_correctly_rescored(cql, test_keyspace, vector_store_
for row, d_row in zip(rows, expected):
assert list(row.embedding) == pytest.approx(d_row.embedding)
assert len(rows[0]) == 2
# Checks that the similarity column in SELECT is evaluated against the vector
# passed to the similarity function, even when that vector is not the one used
# for ANN ordering. The statement is prepared, so the two vectors only become
# distinguishable once the bind values are supplied at execution time.
def test_select_similarity_function_other_than_ann_ordering(cql, test_keyspace, vector_store_mock, skip_without_tablets):
    dataset = TEST_DATA["cosine"]
    with rescoring_test_table(cql, test_keyspace, dataset) as table:
        vector_store_mock.set_next_ann_response(200, reversed_ann_response(dataset))
        query = (
            f"SELECT id, similarity_cosine(embedding, ?) AS similarity FROM {table} "
            f"ORDER BY embedding ANN OF ? LIMIT 2")
        statement = cql.prepare(query)
        # First bind value feeds the similarity function (dataset[1].embedding),
        # second one is the ANN ordering vector (ANN_QUERY_VECTOR).
        similarity_argument = dataset[1].embedding
        result = list(cql.execute(statement, [similarity_argument, ANN_QUERY_VECTOR]))
        returned_ids = [row.id for row in result]
        expected_ids = [d_row.id for d_row in dataset[:2]]
        assert returned_ids == expected_ids
        # The reported similarity refers to dataset[1].embedding, not to the ANN query vector:
        #   id=1 (rescored rank 0): similarity(dataset[0].embedding, dataset[1].embedding) = 0.97
        #   id=2 (rescored rank 1): similarity(dataset[1].embedding, dataset[1].embedding) = 1 (self-similarity)
        assert result[0].similarity == pytest.approx(0.97, abs=0.01)
        assert result[1].similarity == pytest.approx(1, abs=0.01)
        # Only the two selected columns (id, similarity) are returned.
        assert len(result[0]) == 2

View File

@@ -107,45 +107,6 @@ struct print_log_value<std::vector<float>> {
};
}
SEASTAR_TEST_CASE(select_similarity_function_other_than_ann_ordering) {
    // The selected similarity column takes its own vector argument, which differs from the
    // ANN ordering vector. Both are bind markers of a prepared statement, so the difference
    // between them is only visible at execution time.
    const auto& params = test_data[0];
    auto server = co_await make_vs_mock_server();
    co_await do_with_cql_env(
            [&](cql_test_env& env) -> future<> {
                configure(env.local_qp().vector_store_client()).with_dns({{"server.node", std::vector<std::string>{server->host()}}});
                env.local_qp().vector_store_client().start_background_tasks();
                co_await create_index_and_insert_data(env, params);

                // The mocked vector store answers with every key, listed in REVERSE similarity order.
                // (The raw string below is kept flush-left so that its content stays unchanged.)
                server->next_ann_response({http::reply::status_type::ok, R"({
"primary_keys": { "id": [4, 3, 2, 1] },
"similarity_scores": [0, 0, 0, 0]
})"});

                const auto query = fmt::format(
                        "SELECT id, similarity_{}(embedding, ?) FROM ks.cf ORDER BY embedding ANN OF ? LIMIT 2;", params.function_name);
                auto prepared_id = co_await env.prepare(query);

                // First bind value: argument of the similarity function; second: ANN ordering vector.
                auto response = co_await env.execute_prepared(prepared_id, cql3::raw_value_vector_with_unset{
                        cql3::raw_value::make_value(to_embedding(params.vectors[1])),
                        cql3::raw_value::make_value(to_embedding({0.1f, 0.1f}))});

                auto rows_message = dynamic_pointer_cast<cql_transport::messages::result_message::rows>(response);
                BOOST_REQUIRE(rows_message);

                const auto& result_set = rows_message->rs().result_set();
                const auto& rows = result_set.rows();
                // Hard requirement first, so that the row accesses below cannot go out of range.
                BOOST_REQUIRE(rows.size() >= 2);
                BOOST_CHECK_EQUAL(rows.size(), 2);
                BOOST_CHECK_EQUAL(result_set.get_metadata().column_count(), 2);

                const auto& first_row = rows.at(0);
                const auto& second_row = rows.at(1);
                BOOST_CHECK_EQUAL(get_id_col_value(first_row), 1);
                BOOST_CHECK(is_similarity_eq(get_similarity_col_value(first_row), params.expected_similarity[1]));
                BOOST_CHECK_EQUAL(get_id_col_value(second_row), 2);
                BOOST_CHECK(is_similarity_eq(get_similarity_col_value(second_row), params.expected_similarity[0]));
            },
            make_config(format("http://server.node:{}", server->port())))
            .finally(seastar::coroutine::lambda([&]() -> future<> {
                // Stop the mock server whether the test body succeeded or failed.
                co_await server->stop();
            }));
}
// Rescoring does not filter out NULL embeddings yet, but they should be sorted last.
// So this test is expected to report errors on the result set size, but it passes if the first element is correct.
SEASTAR_TEST_CASE(no_nulls_in_rescored_results, *boost::unit_test::expected_failures(3)) {