From 63d9b7445f663ee52c9371fbf37c8b5ecde015fb Mon Sep 17 00:00:00 2001 From: Szymon Malewski Date: Thu, 14 May 2026 11:29:23 +0200 Subject: [PATCH] test/vector_search: migrate non-ANN similarity argument rescoring test to pytest Migrate select_similarity_function_other_than_ann_ordering from rescoring_test.cc to pytest. The test verifies that similarity scores in SELECT are computed against the explicitly supplied argument vector rather than the ANN ordering vector. No semantic change. --- .../test_vector_search_rescoring_with_mock.py | 24 ++++++++++++ test/vector_search/rescoring_test.cc | 39 ------------------- 2 files changed, 24 insertions(+), 39 deletions(-) diff --git a/test/cqlpy/test_vector_search_rescoring_with_mock.py b/test/cqlpy/test_vector_search_rescoring_with_mock.py index 721d5ca36d..18f3179683 100644 --- a/test/cqlpy/test_vector_search_rescoring_with_mock.py +++ b/test/cqlpy/test_vector_search_rescoring_with_mock.py @@ -212,3 +212,27 @@ def test_wildcard_select_is_correctly_rescored(cql, test_keyspace, vector_store_ for row, d_row in zip(rows, expected): assert list(row.embedding) == pytest.approx(d_row.embedding) assert len(rows[0]) == 2 + + +# Verifies that when the similarity function argument in SELECT differs from the +# ANN ordering vector, the correct similarity values are computed. Uses a +# prepared statement so the argument difference is only visible at execution time. +def test_select_similarity_function_other_than_ann_ordering(cql, test_keyspace, vector_store_mock, skip_without_tablets): + data = TEST_DATA["cosine"] + with rescoring_test_table(cql, test_keyspace, data) as table: + vector_store_mock.set_next_ann_response(200, reversed_ann_response(data)) + + prepared = cql.prepare( + f"SELECT id, similarity_cosine(embedding, ?) AS similarity FROM {table} " + f"ORDER BY embedding ANN OF ? LIMIT 2") + # Compute similarity to data[1].embedding while ordering by ANN_QUERY_VECTOR. + rows = list(cql.execute(prepared, [data[1].embedding, ANN_QUERY_VECTOR])) + + expected = data[:2] + assert [row.id for row in rows] == [d_row.id for d_row in expected] + # Similarity is computed against data[1].embedding, not the ANN query vector. + # id=1 (rescored rank 0): similarity(data[0].embedding, data[1].embedding) = 0.97. + # id=2 (rescored rank 1): similarity(data[1].embedding, data[1].embedding) = 1 (self-similarity). + assert rows[0].similarity == pytest.approx(0.97, abs=0.01) + assert rows[1].similarity == pytest.approx(1, abs=0.01) + assert len(rows[0]) == 2 diff --git a/test/vector_search/rescoring_test.cc b/test/vector_search/rescoring_test.cc index bccc77e460..7d35e662e6 100644 --- a/test/vector_search/rescoring_test.cc +++ b/test/vector_search/rescoring_test.cc @@ -107,45 +107,6 @@ struct print_log_value> { }; } -SEASTAR_TEST_CASE(select_similarity_function_other_than_ann_ordering) { - // Another tricky case with similarity column with argument different from ANN ordering vector. - // Especially if we use prepared statement and the difference is only seen at execution time. - const auto& params = test_data[0]; - auto server = co_await make_vs_mock_server(); - co_await do_with_cql_env( - [&](cql_test_env& env) -> future<> { - configure(env.local_qp().vector_store_client()).with_dns({{"server.node", std::vector{server->host()}}}); - env.local_qp().vector_store_client().start_background_tasks(); - co_await create_index_and_insert_data(env, params); - - // Mock Response: Return all keys but in REVERSE similarity order. - server->next_ann_response({http::reply::status_type::ok, R"({ - "primary_keys": { "id": [4, 3, 2, 1] }, - "similarity_scores": [0, 0, 0, 0] - })"}); - auto prep = co_await env.prepare(fmt::format( - "SELECT id, similarity_{}(embedding, ?) FROM ks.cf ORDER BY embedding ANN OF ? LIMIT 2;", params.function_name)); - auto msg = co_await env.execute_prepared(prep, cql3::raw_value_vector_with_unset{ - cql3::raw_value::make_value(to_embedding(params.vectors[1])), - cql3::raw_value::make_value(to_embedding({0.1f, 0.1f}))}); - - auto rms = dynamic_pointer_cast(msg); - BOOST_REQUIRE(rms); - const auto& rows = rms->rs().result_set().rows(); - BOOST_REQUIRE(rows.size() >= 2); - BOOST_CHECK_EQUAL(rows.size(), 2); - BOOST_CHECK_EQUAL(rms->rs().result_set().get_metadata().column_count(), 2); - BOOST_CHECK_EQUAL(get_id_col_value(rows.at(0)), 1); - BOOST_CHECK(is_similarity_eq(get_similarity_col_value(rows.at(0)), params.expected_similarity[1])); - BOOST_CHECK_EQUAL(get_id_col_value(rows.at(1)), 2); - BOOST_CHECK(is_similarity_eq(get_similarity_col_value(rows.at(1)), params.expected_similarity[0])); - }, - make_config(format("http://server.node:{}", server->port()))) - .finally(seastar::coroutine::lambda([&] -> future<> { - co_await server->stop(); - })); -} - // Rescoring does not filter out NULL embeddings yet, but they should be sorted as last. // So this test is expected to report error on result set size, but passes if the first element is correct. SEASTAR_TEST_CASE(no_nulls_in_rescored_results, *boost::unit_test::expected_failures(3)) {