mirror of
https://github.com/scylladb/scylladb.git
synced 2026-05-30 03:30:49 +00:00
test/vector_search: migrate non-ANN similarity argument rescoring test to pytest
Migrate select_similarity_function_other_than_ann_ordering from rescoring_test.cc to pytest. The test verifies that similarity scores in SELECT are computed against the explicitly supplied argument vector rather than the ANN ordering vector. No semantic change.
This commit is contained in:
@@ -212,3 +212,27 @@ def test_wildcard_select_is_correctly_rescored(cql, test_keyspace, vector_store_
|
||||
for row, d_row in zip(rows, expected):
|
||||
assert list(row.embedding) == pytest.approx(d_row.embedding)
|
||||
assert len(rows[0]) == 2
|
||||
|
||||
|
||||
# Verifies that when the similarity function argument in SELECT differs from the
|
||||
# ANN ordering vector, the correct similarity values are computed. Uses a
|
||||
# prepared statement so the argument difference is only visible at execution time.
|
||||
def test_select_similarity_function_other_than_ann_ordering(cql, test_keyspace, vector_store_mock, skip_without_tablets):
    # The SELECT list's similarity argument and the ANN ordering vector are two
    # separate bind markers, so the similarity column must follow the first
    # marker rather than the vector used for ordering.
    dataset = TEST_DATA["cosine"]
    with rescoring_test_table(cql, test_keyspace, dataset) as table:
        # Queue the mock's next ANN reply before the query is executed.
        vector_store_mock.set_next_ann_response(200, reversed_ann_response(dataset))

        statement = cql.prepare(
            f"SELECT id, similarity_cosine(embedding, ?) AS similarity FROM {table} "
            f"ORDER BY embedding ANN OF ? LIMIT 2")
        # First marker: similarity argument (dataset[1].embedding).
        # Second marker: the ANN ordering vector (ANN_QUERY_VECTOR).
        bind_values = [dataset[1].embedding, ANN_QUERY_VECTOR]
        result = list(cql.execute(statement, bind_values))

        # The returned ids must match the first two entries of the test data, in order.
        wanted_ids = [entry.id for entry in dataset[:2]]
        assert [fetched.id for fetched in result] == wanted_ids

        # Similarity is measured against dataset[1].embedding, not the ANN query vector:
        #   id=1 (rank 0): similarity(dataset[0].embedding, dataset[1].embedding) = 0.97
        #   id=2 (rank 1): similarity(dataset[1].embedding, dataset[1].embedding) = 1 (self-similarity)
        for fetched, wanted_similarity in zip(result, (0.97, 1)):
            assert fetched.similarity == pytest.approx(wanted_similarity, abs=0.01)

        # Only the two selected columns (id, similarity) are present in a row.
        assert len(result[0]) == 2
|
||||
|
||||
@@ -107,45 +107,6 @@ struct print_log_value<std::vector<float>> {
|
||||
};
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(select_similarity_function_other_than_ann_ordering) {
    // The similarity function in the SELECT list receives an argument that differs from the
    // ANN ordering vector. Both are bind markers of a prepared statement, so the difference
    // only becomes visible once the values are bound at execution time.
    const auto& tc = test_data[0];
    auto vs_server = co_await make_vs_mock_server();

    // NOTE: the whole do_with_cql_env(...).finally(...) chain stays a single awaited expression
    // so the test-body lambda and the cleanup continuation live for the full await.
    co_await do_with_cql_env(
            [&](cql_test_env& env) -> future<> {
                // Point the vector store client at the mock server and start its background work.
                configure(env.local_qp().vector_store_client()).with_dns({{"server.node", std::vector<std::string>{vs_server->host()}}});
                env.local_qp().vector_store_client().start_background_tasks();
                co_await create_index_and_insert_data(env, tc);

                // Mock response: all keys, listed in REVERSE similarity order.
                vs_server->next_ann_response({http::reply::status_type::ok, R"({
                    "primary_keys": { "id": [4, 3, 2, 1] },
                    "similarity_scores": [0, 0, 0, 0]
                })"});

                auto stmt_id = co_await env.prepare(fmt::format(
                        "SELECT id, similarity_{}(embedding, ?) FROM ks.cf ORDER BY embedding ANN OF ? LIMIT 2;", tc.function_name));

                // First bound value: the similarity argument (tc.vectors[1]).
                // Second bound value: the ANN ordering vector.
                auto response = co_await env.execute_prepared(stmt_id, cql3::raw_value_vector_with_unset{
                        cql3::raw_value::make_value(to_embedding(tc.vectors[1])),
                        cql3::raw_value::make_value(to_embedding({0.1f, 0.1f}))});

                auto rows_msg = dynamic_pointer_cast<cql_transport::messages::result_message::rows>(response);
                BOOST_REQUIRE(rows_msg);

                const auto& result_rows = rows_msg->rs().result_set().rows();
                // Hard requirement first: the per-row checks below index rows 0 and 1.
                BOOST_REQUIRE(result_rows.size() >= 2);
                BOOST_CHECK_EQUAL(result_rows.size(), 2);
                BOOST_CHECK_EQUAL(rows_msg->rs().result_set().get_metadata().column_count(), 2);

                // Row 0 must be id 1 with the similarity stored at expected_similarity[1].
                const auto& first_row = result_rows.at(0);
                BOOST_CHECK_EQUAL(get_id_col_value(first_row), 1);
                BOOST_CHECK(is_similarity_eq(get_similarity_col_value(first_row), tc.expected_similarity[1]));

                // Row 1 must be id 2 with the similarity stored at expected_similarity[0].
                const auto& second_row = result_rows.at(1);
                BOOST_CHECK_EQUAL(get_id_col_value(second_row), 2);
                BOOST_CHECK(is_similarity_eq(get_similarity_col_value(second_row), tc.expected_similarity[0]));
            },
            make_config(format("http://server.node:{}", vs_server->port())))
            .finally(seastar::coroutine::lambda([&]() -> future<> {
                // Stop the mock server once the environment is done, on success or failure.
                co_await vs_server->stop();
            }));
}
|
||||
|
||||
// Rescoring does not filter out NULL embeddings yet, but they should be sorted last.
|
||||
// So this test is expected to report an error on the result set size, but it passes if the first element is correct.
|
||||
SEASTAR_TEST_CASE(no_nulls_in_rescored_results, *boost::unit_test::expected_failures(3)) {
|
||||
|
||||
Reference in New Issue
Block a user