Files
scylladb/dist/common/scripts/scylla_sysconfig_setup
Takuya ASADA eb30594a60 dist: detect corrupted NUMA topology information
Some environments have corrupted NUMA topology information, such as
certain instance types on AWS EC2 with specific Linux kernel images.
In such environments, we cannot get HW information correctly from hwloc,
so we cannot proceed with the optimization in perftune.
To avoid causing a script error, check the NUMA topology information and
skip running perftune if the information is corrupted.

Related scylladb/seastar#2925

Closes scylladb/scylladb#26344
2025-10-22 01:11:14 +03:00

132 lines
5.2 KiB
Python
Executable File

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2018-present ScyllaDB
#
#
# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
import os
import sys
import argparse
import subprocess
import re
from scylla_util import *
from subprocess import run
def bool2str(val):
    """Render a truthy/falsy value as the sysconfig literal 'yes' or 'no'."""
    if val:
        return 'yes'
    return 'no'
def str2bool(val):
    """Return True only when *val* is exactly the string 'yes'.

    Any other value (including 'no', '', 'YES' or None) maps to False.
    """
    # The comparison already yields a bool; no conditional expression needed.
    return val == 'yes'
if __name__ == '__main__':
    # Script entry point: update the scylla-server sysconfig file from the
    # command-line options, and (optionally) derive the CPU set from perftune.
    if os.getuid() > 0:
        print('Requires root permission.')
        sys.exit(1)

    cfg = sysconfig_parser(sysconfdir_p() / 'scylla-server')

    # Current on-disk values become the defaults of the boolean flags below.
    set_nic_and_disks = str2bool(get_set_nic_and_disks_config_value(cfg))
    # When an option is absent, fall back to a real bool: the string 'no'
    # is truthy and would not match the bools produced by str2bool().
    if cfg.has_option('SET_CLOCKSOURCE'):
        set_clocksource = str2bool(cfg.get('SET_CLOCKSOURCE'))
    else:
        set_clocksource = False
    if cfg.has_option('DISABLE_WRITEBACK_CACHE'):
        disable_writeback_cache = str2bool(cfg.get('DISABLE_WRITEBACK_CACHE'))
    else:
        disable_writeback_cache = False

    parser = argparse.ArgumentParser(description='Setting parameters on Scylla sysconfig file.')
    parser.add_argument('--nic',
                        help='specify NIC')
    parser.add_argument('--mode',
                        help='network mode (posix, dpdk)')
    parser.add_argument('--nr-hugepages', type=int,
                        help='number of hugepages')
    parser.add_argument('--user',
                        help='user (dpdk requires root)')
    parser.add_argument('--group',
                        help='group (dpdk requires root)')
    parser.add_argument('--homedir',
                        help='scylla home directory')
    parser.add_argument('--confdir',
                        help='scylla config directory')
    parser.add_argument('--setup-nic-and-disks', action='store_true', default=set_nic_and_disks,
                        help='setup NIC\'s and disks\' interrupts, RPS, XPS, nomerges and I/O scheduler')
    parser.add_argument('--set-clocksource', action='store_true', default=set_clocksource,
                        help='Set enforcing fastest available Linux clocksource')
    parser.add_argument('--disable-writeback-cache', action='store_true', default=disable_writeback_cache,
                        help='Disable disk writeback cache')
    args = parser.parse_args()

    if args.nic and not is_valid_nic(args.nic):
        print('NIC {} not found.'.format(args.nic))
        sys.exit(1)

    # Command-line values take precedence over what is stored in the file.
    ifname = args.nic if args.nic else cfg.get('IFNAME')
    network_mode = args.mode if args.mode else cfg.get('NETWORK_MODE')

    if args.setup_nic_and_disks:
        if not check_sysfs_numa_topology_is_valid():
            print('NUMA topology information is corrupted, not able to enable perftune feature.')
            sys.exit(3)
        res = out('{} --tune net --nic {} --get-cpu-mask'.format(perftune_base_command(), ifname))
        # we need to extract CPU mask from output, since perftune.py may also print warning messages (#10082)
        match = re.match('(.*\n)?(0x[0-9a-f]+(?:,0x[0-9a-f]+)*)', res, re.DOTALL)
        # Test the match explicitly instead of using a bare except, which
        # would also swallow KeyboardInterrupt/SystemExit.
        if match is None:
            raise Exception(f'Failed to retrieve CPU mask: {res}')
        warning = match.group(1)
        rps_cpus = match.group(2)
        # print warning message if available
        if warning:
            print(warning.strip())
        if rps_cpus:
            cpuset = hex2list(rps_cpus)
            run('/opt/scylladb/scripts/scylla_cpuset_setup --cpuset {}'.format(cpuset), shell=True, check=True)

    ethdrv = ''
    ethpciid = ''
    if network_mode == 'dpdk':
        dpdk_status = out('/opt/scylladb/scripts/dpdk-devbind.py --status')
        # Escape the NIC name so characters such as '.' are matched literally.
        nic_pattern = re.escape(str(ifname))
        drv_match = re.search(r'if={} drv=(\S+)'.format(nic_pattern), dpdk_status, flags=re.MULTILINE)
        pci_match = re.search(r'^(\S+:\S+:\S+\.\S+) [^\n]+ if={} '.format(nic_pattern), dpdk_status, flags=re.MULTILINE)
        # Fail with a readable message rather than an AttributeError on None.
        if drv_match is None or pci_match is None:
            raise Exception(f'Failed to find NIC {ifname} in dpdk-devbind.py output: {dpdk_status}')
        ethdrv = drv_match.group(1)
        ethpciid = pci_match.group(1)

    # Write back only the values that were given or that actually changed.
    if args.mode:
        cfg.set('NETWORK_MODE', args.mode)
    if args.nic:
        cfg.set('IFNAME', args.nic)
    if cfg.get('ETHDRV') != ethdrv:
        cfg.set('ETHDRV', ethdrv)
    if cfg.get('ETHPCIID') != ethpciid:
        cfg.set('ETHPCIID', ethpciid)
    if args.nr_hugepages:
        cfg.set('NR_HUGEPAGES', args.nr_hugepages)
    if args.user:
        cfg.set('USER', args.user)
    if args.group:
        cfg.set('GROUP', args.group)
    if args.homedir:
        cfg.set('SCYLLA_HOME', args.homedir)
    if args.confdir:
        cfg.set('SCYLLA_CONF', args.confdir)

    if str2bool(get_set_nic_and_disks_config_value(cfg)) != args.setup_nic_and_disks:
        # Update whichever of the two option names the file already contains.
        if cfg.has_option('SET_NIC'):
            cfg.set('SET_NIC', bool2str(args.setup_nic_and_disks))
        else:
            cfg.set('SET_NIC_AND_DISKS', bool2str(args.setup_nic_and_disks))

    if cfg.has_option('SET_CLOCKSOURCE') and str2bool(cfg.get('SET_CLOCKSOURCE')) != args.set_clocksource:
        # Refuse to change the clocksource setting on hosts whose NUMA
        # topology information is reported as invalid.
        if not check_sysfs_numa_topology_is_valid():
            print('NUMA topology information is corrupted, not able to enable perftune feature.')
            sys.exit(3)
        cfg.set('SET_CLOCKSOURCE', bool2str(args.set_clocksource))

    if cfg.has_option('DISABLE_WRITEBACK_CACHE') and str2bool(cfg.get('DISABLE_WRITEBACK_CACHE')) != args.disable_writeback_cache:
        cfg.set('DISABLE_WRITEBACK_CACHE', bool2str(args.disable_writeback_cache))

    cfg.commit()