From a0ff33e77774714b381ca3bcf2bebeb754d4850e Mon Sep 17 00:00:00 2001 From: Kamil Braun Date: Mon, 23 Jan 2023 11:59:09 +0100 Subject: [PATCH] test/pylib: scylla_cluster: don't leak server if stopping it fails `ScyllaCluster.server_stop` had this piece of code: ``` server = self.running.pop(server_id) if gracefully: await server.stop_gracefully() else: await server.stop() self.stopped[server_id] = server ``` We observed `stop_gracefully()` failing due to a server hanging during shutdown. We then ended up in a state where neither `self.running` nor `self.stopped` had this server. Later, when releasing the cluster and its IPs, we would release that server's IP - but the server might have still been running (all servers in `self.running` are killed before releasing IPs, but this one wasn't in `self.running`). Fix this by popping the server from `self.running` only after `stop_gracefully`/`stop` finishes. Make an analogous fix in `server_start`: put `server` into `self.running` *before* we actually start it. If the start fails, the server will be considered "running" even though it isn't necessarily, but that is OK - if it isn't running, then trying to stop it later will simply do nothing; if it is actually running, we will kill it (which we should do) when clearing after the cluster; and we don't leak it. Closes #12613 --- test/pylib/scylla_cluster.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/test/pylib/scylla_cluster.py b/test/pylib/scylla_cluster.py index 420c274389..293ef90c69 100644 --- a/test/pylib/scylla_cluster.py +++ b/test/pylib/scylla_cluster.py @@ -757,11 +757,14 @@ class ScyllaCluster: if server_id not in self.running: return ScyllaCluster.ActionReturn(success=False, msg=f"Server {server_id} unknown") self.is_dirty = True - server = self.running.pop(server_id) + server = self.running[server_id] + # Remove the server from `running` only after we successfully stop it. + # Stopping may fail and if we removed it from `running` now it might leak. if gracefully: await server.stop_gracefully() else: await server.stop() + self.running.pop(server_id) self.stopped[server_id] = server return ScyllaCluster.ActionReturn(success=True, msg=f"{server} stopped") @@ -781,8 +784,10 @@ class ScyllaCluster: self.is_dirty = True server = self.stopped.pop(server_id) server.seeds = self._seeds() - await server.start(self.api) + # Put the server in `running` before starting it. + # Starting may fail and if we didn't add it now it might leak. self.running[server_id] = server + await server.start(self.api) return ScyllaCluster.ActionReturn(success=True, msg=f"{server} started") async def server_restart(self, server_id: ServerNum) -> ActionReturn: