From 4c60804c4c6b6e28cc59e13897b4f80706aabaeb Mon Sep 17 00:00:00 2001 From: Eliran Sinvani Date: Mon, 25 Dec 2023 16:19:34 +0200 Subject: [PATCH 1/2] rest api: Add an api for profile dumping As part of code coverage support we need to work with dumped profiles for ScyllaDB executables. Those profiles are created on two occasions: 1. When an application exits notmaly (which will trigger __llvm_dump_profile registered in the exit hooks. 2. For ScyllaDB commit d7b524cf10 introduced a manual call to __llvm_dump_profile upon receiving a SIGTERM signal. This commit adds a third option, a rest API to dump the profile. In addition the target file is logged and the counters are reset, which enables incremental dumping of the profile. Except for logging, if the executable is not instrumented, this API call becomes a no-op so it bears minimal risk in keeping it in our releases. Specifically for code coverage, the gain will be that we will not be required to change the entire test run to shut down clusters gracefully and this will cause minimal effect to the actual test behavior. The change was tested by manually triggering the API in with and without instrumentation as well as re triggering it with write permissions for the profile file disabled (to test fault tolerance). Signed-off-by: Eliran Sinvani --- api/api-doc/system.json | 15 +++++++++++++++ api/system.cc | 25 +++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/api/api-doc/system.json b/api/api-doc/system.json index 8ac40f2879..2ef3d44b34 100644 --- a/api/api-doc/system.json +++ b/api/api-doc/system.json @@ -179,6 +179,21 @@ ] } ] + }, + { + "path":"/system/dump_llvm_profile", + "operations":[ + { + "method":"POST", + "summary":"Dump llvm profile data (raw profile data) that can later be used for coverage reporting or PGO (no-op if the current binary is not instrumented)", + "type":"void", + "nickname":"dump_profile", + "produces":[ + "application/json" + ], + "parameters":[] + } + ] } ] } diff --git a/api/system.cc b/api/system.cc index e6f6f6d000..a59b193a30 100644 --- a/api/system.cc +++ b/api/system.cc @@ -30,6 +30,10 @@ using namespace seastar::httpd; namespace hs = httpd::system_json; namespace hm = httpd::metrics_json; +extern "C" void __attribute__((weak)) __llvm_profile_dump(); +extern "C" const char * __attribute__((weak)) __llvm_profile_get_filename(); +extern "C" void __attribute__((weak)) __llvm_profile_reset_counters(); + void set_system(http_context& ctx, routes& r) { hm::get_metrics_config.set(r, [](const_req req) { std::vector res; @@ -158,6 +162,27 @@ void set_system(http_context& ctx, routes& r) { return json::json_return_type(json::json_void()); }); }); + + hs::dump_profile.set(r, [](std::unique_ptr req) { + if (!__llvm_profile_dump) { + apilog.info("Profile will not be dumped, executable is not instrumented with profile dumping."); + return make_ready_future(json::json_return_type(json::json_void())); + } + sstring profile_dest(__llvm_profile_get_filename ? __llvm_profile_get_filename() : "disk"); + apilog.info("Dumping profile to {}", profile_dest); + __llvm_profile_dump(); + if (__llvm_profile_reset_counters) { + // If counters are not reset the profile dumping mechanism will issue a warning and exit + // next time it is attempted. If the counters are reset, profiles can be accumulated + // (if %m is present in LLVM_PROFILE_FILE pattern) so it can be dumped in stages or + // multiple times during runtime. + __llvm_profile_reset_counters(); + } else { + apilog.warn("Could not reset profile counters, profile dumping will be skipped next time it is attempted"); + } + apilog.info("Profile dumped to {}", profile_dest); + return make_ready_future(json::json_return_type(json::json_void())); + }) ; } } From e49b3ffc897e183650991f5a137d9d908677f45b Mon Sep 17 00:00:00 2001 From: Eliran Sinvani Date: Mon, 25 Dec 2023 16:25:14 +0200 Subject: [PATCH 2/2] test.py: Dump coverage profile before killing a node Up until now the only way to get a coverage profile was to shut down the ScyllaDB nodes gracefully (using SIGTERM), this means that the coverage profile was lost for every node that was killed abruptly (SIGKILL). This in turn would have been requiring us to shut down all nodes gracefully which is not something we set out to do. Here we use the rest API for dumping the coverage profile which will cause the most minimal impact possible on the test runs. If the dumping fails (due to the node doesn't support the API or due to a real error in dumping we ignore it as it is not part of the system we would like to test. Signed-off-by: Eliran Sinvani --- test/pylib/rest_client.py | 6 +++++- test/pylib/scylla_cluster.py | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/test/pylib/rest_client.py b/test/pylib/rest_client.py index b237eda12d..3c92ba0a3f 100644 --- a/test/pylib/rest_client.py +++ b/test/pylib/rest_client.py @@ -271,7 +271,11 @@ class ScyllaRESTAPIClient(): if table is not None: url += "?cf={table}" await self.client.post(url, host=node_ip) - + async def dump_llvm_profile(self, node_ip : str): + """Dump llvm profile to disk that can later be used for PGO or coverage reporting. + no-op if the scylla binary is not instrumented.""" + url = "/system/dump_llvm_profile" + await self.client.post(url, host=node_ip) class ScyllaMetrics: def __init__(self, lines: list[str]): diff --git a/test/pylib/scylla_cluster.py b/test/pylib/scylla_cluster.py index 69ccfa08e5..ad9d37870a 100644 --- a/test/pylib/scylla_cluster.py +++ b/test/pylib/scylla_cluster.py @@ -503,6 +503,14 @@ class ScyllaServer: if not self.cmd: return + # Dump the profile if exists and supported by the API. + try: + api = ScyllaRESTAPIClient() + await api.dump_llvm_profile(self.ip_addr) + except: + # since it is not part of the test functionality, allow + # this step to fail unconditionally. + pass await self.shutdown_control_connection() try: self.cmd.kill()