mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-23 01:50:35 +00:00
Analysis of customer stalls revealed that the function `detail::hash_with_salt` (invoked by `passwords::check`) often blocks the reactor. Internally, this function uses the external `crypt_r` function to compute password hashes, which is CPU-intensive.
This PR addresses the issue in two ways:
1) `sha-512` is now the only password hashing scheme for new passwords (it was already the common-case).
2) `passwords::check` is moved to a dedicated alien thread.
Regarding point 1: before this change, the following hashing schemes were supported by `identify_best_supported_scheme()`: bcrypt_y, bcrypt_a, SHA-512, SHA-256, and MD5. The reason for this was that the `crypt_r` function used for password hashing comes from an external library (currently `libxcrypt`), and the supported hashing algorithms vary depending on the library in use. However:
- The bcrypt schemes never worked properly because their prefixes lack the required round count (e.g. `$2y$` instead of `$2y$05$`). Moreover, bcrypt is slower than SHA-512, so it is not a good idea to fix or use it.
- SHA-256 and SHA-512 both belong to the SHA-2 family. Libraries that support one almost always support the other, so it’s very unlikely to find SHA-256 without SHA-512.
- MD5 is no longer considered secure for password hashing.
Regarding point 2: the `passwords::check` call now runs on a shared alien thread created at database startup. An `std::mutex` synchronizes that thread with the shards. In theory this could introduce frequent lock contention, but in practice each shard handles only a few hundred new connections per second—even during storms. There is already a `_conns_cpu_concurrency_semaphore` in `generic_server` that limits the number of concurrent connection handlers.
Fixes https://github.com/scylladb/scylladb/issues/24524
Backport not needed, as it is a new feature.
Closes scylladb/scylladb#24924
* github.com:scylladb/scylladb:
main: utils: add thread names to alien workers
auth: move passwords::check call to alien thread
test: wait for 3 clients with given username in test_service_level_api
auth: refactor password checking in password_authenticator
auth: make SHA-512 the only password hashing scheme for new passwords
auth: whitespace change in identify_best_supported_scheme()
auth: require scheme as parameter for `generate_salt`
auth: check password hashing scheme support on authenticator start
(cherry picked from commit c762425ea7)
154 lines
6.3 KiB
Python
154 lines
6.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Copyright 2024-present ScyllaDB
|
|
#
|
|
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
|
|
|
|
########################################
|
|
# Tests for the service levels HTTP API.
|
|
########################################
|
|
|
|
import pytest
|
|
from .rest_api import get_request, post_request
|
|
from .util import new_session, unique_name
|
|
import time
|
|
|
|
# All tests in this file check the Scylla-only service levels feature,
|
|
# so let's mark them all scylla_only with an autouse fixture:
|
|
@pytest.fixture(scope="function", autouse=True)
def all_tests_are_scylla_only(scylla_only):
    """Autouse fixture that makes every test in this file request `scylla_only`.

    The body is intentionally empty: depending on the `scylla_only` fixture
    is the only thing this fixture does.
    """
|
|
|
|
def get_shard_count(cql):
    """Return the `shard_count` value of the first row of `system.topology`."""
    topology_row = cql.execute("SELECT shard_count FROM system.topology").one()
    return topology_row.shard_count
|
|
|
|
def read_barrier(cql):
    """Execute `DROP TABLE IF EXISTS` on `nosuchkeyspace.nosuchtable`.

    NOTE(review): judging by the function name, the statement is presumably
    issued only for its synchronizing side effect - confirm.
    """
    barrier_statement = "DROP TABLE IF EXISTS nosuchkeyspace.nosuchtable"
    cql.execute(barrier_statement)
|
|
|
|
def count_opened_connections(cql, retry_unauthenticated=True):
    """Return the result of a GET request to `service_levels/count_connections`.

    `retry_unauthenticated` is accepted but not used by this function.
    """
    return get_request(cql, "service_levels/count_connections")
|
|
|
|
def switch_tenants(cql):
    """Send a POST request to `service_levels/switch_tenants` and return its result."""
    response = post_request(cql, "service_levels/switch_tenants")
    return response
|
|
|
|
def count_opened_connections_from_table(cql):
    """Count the CQL connections listed in `system.clients`.

    Returns a plain nested dict ``{scheduling_group: {username: count}}``
    built from the rows whose `client_type` is 'cql'.
    """
    connections = cql.execute("SELECT username, scheduling_group FROM system.clients WHERE client_type='cql' ALLOW FILTERING")
    result = {}
    for row in connections:
        user, shg = row[0], row[1]
        # setdefault creates the per-group dict the first time a group is seen;
        # plain dicts are kept so the result compares equal to other dicts.
        per_user = result.setdefault(shg, {})
        per_user[user] = per_user.get(user, 0) + 1
    return result
|
|
|
|
def wait_for_clients(cql, username, clients_num, wait_s = 1, timeout_s = 30):
    """Poll `system.clients` until `username` has exactly `clients_num` rows.

    The count is re-queried every `wait_s` seconds. Returns None as soon as
    the count matches.

    Raises:
        RuntimeError: if the count did not match within `timeout_s` seconds.
    """
    # A monotonic clock keeps the timeout immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout_s
    query = f"SELECT COUNT(*) FROM system.clients WHERE username='{username}' ALLOW FILTERING"
    while time.monotonic() < deadline:
        if cql.execute(query).one()[0] == clients_num:
            return
        time.sleep(wait_s)

    raise RuntimeError(f"Awaiting for {clients_num} clients timed out.")
|
|
|
|
def wait_until_all_connections_authenticated(cql, wait_s = 1, timeout_s = 30):
    """Poll `system.clients` until no row has the username 'anonymous'.

    The count is re-queried every `wait_s` seconds. Returns None as soon as
    the count of 'anonymous' rows is zero.

    Raises:
        RuntimeError: if such rows are still present after `timeout_s` seconds.
    """
    # A monotonic clock keeps the timeout immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout_s
    query = "SELECT COUNT(*) FROM system.clients WHERE username='anonymous' ALLOW FILTERING"
    while time.monotonic() < deadline:
        if cql.execute(query).one()[0] == 0:
            return
        time.sleep(wait_s)

    raise RuntimeError("Awaiting for connections authentication timed out.")
|
|
|
|
# The driver creates 1 connection per shard plus 1 control connection.
|
|
# This function validates that all connections except the control one use the correct scheduling group.
|
|
def verify_scheduling_group_assignment(cql, user, target_scheduling_group, shard_count):
    """Assert that `user` has a connection in the target scheduling group on every shard.

    Reads the user's CQL connections from `system.clients`, collects the
    distinct `shard_id` values of the connections whose `scheduling_group`
    contains `target_scheduling_group`, and asserts that there are exactly
    `shard_count` of them.

    Raises:
        AssertionError: if the number of such shards differs from `shard_count`.
    """
    connections = cql.execute(f"SELECT username, scheduling_group, shard_id FROM system.clients WHERE client_type='cql' AND username='{user}' ALLOW FILTERING")

    # Collecting shard ids in a set means several connections on one shard
    # count once, and connections in other groups are simply ignored.
    shards_with_correct_sg = {
        conn.shard_id
        for conn in connections
        if target_scheduling_group in conn.scheduling_group
    }

    assert len(shards_with_correct_sg) == shard_count, (
        f"Not all user '{user}' connections are working under target scheduling group '{target_scheduling_group}'. "
        f"Shards with correct scheduling group: {sorted(shards_with_correct_sg)}, expected shard count: {shard_count}"
    )
|
|
|
|
# Test if `/service_levels/count_connections` prints counted CQL connections
|
|
# per scheduling group per user.
|
|
def test_count_opened_cql_connections(cql):
    """Compare `/service_levels/count_connections` with counts derived from `system.clients`."""
    role = f"test_user_{unique_name()}"
    service_level = f"sl_{unique_name()}"

    # Create the role and its service level, then attach one to the other.
    setup_statements = (
        f"CREATE ROLE {role} WITH login = true AND password='{role}'",
        f"CREATE SERVICE LEVEL {service_level} WITH shares = 100",
        f"ATTACH SERVICE LEVEL {service_level} TO {role}",
    )
    for statement in setup_statements:
        cql.execute(statement)
    read_barrier(cql)

    # Key under which the endpoint is expected to report the service level.
    sl_key = f"sl:{service_level}"
    try:
        with new_session(cql, role):
            # 3 clients: one per shard with smp=2, plus the control connection.
            wait_for_clients(cql, role, 3)
            wait_until_all_connections_authenticated(cql)
            verify_scheduling_group_assignment(cql, role, service_level, get_shard_count(cql))

            api_response = count_opened_connections(cql)
            assert sl_key in api_response
            assert role in api_response[sl_key]

            # The endpoint must agree with what system.clients reports.
            table_response = count_opened_connections_from_table(cql)
            assert api_response == table_response
    finally:
        cleanup_statements = (
            f"DETACH SERVICE LEVEL FROM {role}",
            f"DROP ROLE {role}",
            f"DROP SERVICE LEVEL {service_level}",
        )
        for statement in cleanup_statements:
            cql.execute(statement)
|
|
|
|
# Test if `/service_levels/switch_tenants` updates scheduling group
|
|
# of CQL connections without restarting them.
|
|
#
|
|
# This test creates a `test_user` and 2 service levels `sl1` and `sl2`.
|
|
# First, the user is assigned to `sl1` and their connections are created.
|
|
# Then the test changes user's service level to `sl2` and
|
|
# `/service_levels/switch_tenants` endpoint is called.
|
|
def test_switch_tenants(cql):
    """Check that `/service_levels/switch_tenants` moves open connections to the new service level."""
    role = f"test_user_{unique_name()}"
    initial_sl = f"sl1_{unique_name()}"
    target_sl = f"sl2_{unique_name()}"
    shard_count = get_shard_count(cql)

    # Create a superuser role and two service levels; start the role on the first one.
    setup_statements = (
        f"CREATE ROLE {role} WITH login = true AND password='{role}' AND superuser = true",
        f"CREATE SERVICE LEVEL {initial_sl} WITH shares = 100",
        f"CREATE SERVICE LEVEL {target_sl} WITH shares = 200",
        f"ATTACH SERVICE LEVEL {initial_sl} TO {role}",
    )
    for statement in setup_statements:
        cql.execute(statement)
    read_barrier(cql)

    try:
        with new_session(cql, role) as user_session:
            # 3 clients: one per shard with smp=2, plus the control connection.
            wait_for_clients(cql, role, 3)
            wait_until_all_connections_authenticated(cql)
            verify_scheduling_group_assignment(cql, role, initial_sl, shard_count)

            # Re-attach the role to the second service level while its connections stay open.
            cql.execute(f"DETACH SERVICE LEVEL FROM {role}")
            cql.execute(f"ATTACH SERVICE LEVEL {target_sl} TO {role}")
            read_barrier(cql)

            switch_tenants(cql)
            # Switching tenants may be blocked while a connection is waiting for a request
            # (see 'generic_server::connection::process_until_tenant_switch()').
            # Execute enough cheap statements so that the connection on each shard
            # processes at least one statement and updates its tenant.
            cheap_statement_count = 100
            for _ in range(cheap_statement_count):
                read_barrier(user_session)
            verify_scheduling_group_assignment(cql, role, target_sl, shard_count)
    finally:
        cleanup_statements = (
            f"DETACH SERVICE LEVEL FROM {role}",
            f"DROP ROLE {role}",
            f"DROP SERVICE LEVEL {initial_sl}",
            f"DROP SERVICE LEVEL {target_sl}",
        )
        for statement in cleanup_statements:
            cql.execute(statement)
|