mirror of
https://github.com/scylladb/scylladb.git
synced 2026-04-21 00:50:35 +00:00
Online discard asks the disk to erase flash memory cells as soon as files are deleted. This gives the disk more freedom to choose where to place new files, so it improves performance. On older kernel versions, and on really bad disks, this can reduce performance so we add an option to disable it. Since fstrim is pointless when online discard is enabled, we don't configure it if online discard is selected. I tested it on an AWS i3.large instance, the flag showd up in `mount` after configuration. Closes #9608
219 lines
8.0 KiB
Python
Executable File
219 lines
8.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright 2018-present ScyllaDB
|
|
#
|
|
|
|
#
|
|
# This file is part of Scylla.
|
|
#
|
|
# Scylla is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU Affero General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# Scylla is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with Scylla. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
import os
|
|
import argparse
|
|
import pwd
|
|
import grp
|
|
import sys
|
|
import stat
|
|
import distro
|
|
from pathlib import Path
|
|
from scylla_util import *
|
|
from subprocess import run
|
|
|
|
if __name__ == '__main__':
|
|
if os.getuid() > 0:
|
|
print('Requires root permission.')
|
|
sys.exit(1)
|
|
parser = argparse.ArgumentParser(description='Configure RAID volume for Scylla.')
|
|
parser.add_argument('--disks', required=True,
|
|
help='specify disks for RAID')
|
|
parser.add_argument('--raiddev',
|
|
help='MD device name for RAID')
|
|
parser.add_argument('--enable-on-nextboot', '--update-fstab', action='store_true', default=False,
|
|
help='mount RAID on next boot')
|
|
parser.add_argument('--root', default='/var/lib/scylla',
|
|
help='specify the root of the tree')
|
|
parser.add_argument('--volume-role', default='all',
|
|
help='specify how will this device be used (data, commitlog, or all)')
|
|
parser.add_argument('--force-raid', action='store_true', default=False,
|
|
help='force constructing RAID when only one disk is specified')
|
|
parser.add_argument('--raid-level', default='0',
|
|
help='specify RAID level')
|
|
parser.add_argument('--online-discard', default=True,
|
|
help='Enable XFS online discard (trim SSD cells after file deletion)')
|
|
|
|
args = parser.parse_args()
|
|
|
|
root = args.root.rstrip('/')
|
|
if args.volume_role == 'all':
|
|
mount_at=root
|
|
elif args.volume_role == 'data':
|
|
mount_at='{}/data'.format(root)
|
|
elif args.volume_role == 'commitlog':
|
|
mount_at='{}/commitlog'.format(root)
|
|
else:
|
|
print('Invalid role specified ({})'.format(args.volume_role))
|
|
parser.print_help()
|
|
sys.exit(1)
|
|
|
|
disks = args.disks.split(',')
|
|
for disk in disks:
|
|
if not os.path.exists(disk):
|
|
print('{} is not found'.format(disk))
|
|
sys.exit(1)
|
|
if not stat.S_ISBLK(os.stat(disk).st_mode):
|
|
print('{} is not block device'.format(disk))
|
|
sys.exit(1)
|
|
if not is_unused_disk(disk):
|
|
print('{} is busy'.format(disk))
|
|
sys.exit(1)
|
|
|
|
if len(disks) == 1 and not args.force_raid:
|
|
raid = False
|
|
fsdev = disks[0]
|
|
else:
|
|
raid = True
|
|
if args.raiddev is None:
|
|
raiddevs_to_try = [f'/dev/md{i}' for i in range(10)]
|
|
else:
|
|
raiddevs_to_try = [args.raiddev, ]
|
|
for fsdev in raiddevs_to_try:
|
|
raiddevname = os.path.basename(fsdev)
|
|
array_state = Path(f'/sys/block/{raiddevname}/md/array_state')
|
|
# mdX is not allocated
|
|
if not array_state.exists():
|
|
break
|
|
with array_state.open() as f:
|
|
# allocated, but no devices, not running
|
|
if f.read().strip() == 'clear':
|
|
break
|
|
print(f'{fsdev} is already using')
|
|
else:
|
|
if args.raiddev is None:
|
|
print("Can't find unused /dev/mdX")
|
|
sys.exit(1)
|
|
print(f'{fsdev} will be used to setup a RAID')
|
|
|
|
if os.path.ismount(mount_at):
|
|
print('{} is already mounted'.format(mount_at))
|
|
sys.exit(1)
|
|
|
|
mntunit_bn = run('systemd-escape -p --suffix=mount {}'.format(mount_at), shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
|
|
mntunit = Path('/etc/systemd/system/{}'.format(mntunit_bn))
|
|
if mntunit.exists():
|
|
print('mount unit {} already exists'.format(mntunit))
|
|
sys.exit(1)
|
|
|
|
if not shutil.which('mkfs.xfs'):
|
|
pkg_install('xfsprogs')
|
|
if not shutil.which('mdadm'):
|
|
pkg_install('mdadm')
|
|
try:
|
|
md_service = systemd_unit('mdmonitor.service')
|
|
except SystemdException:
|
|
md_service = systemd_unit('mdadm.service')
|
|
|
|
print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type='fRAID{args.raid_level}' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
|
|
procs=[]
|
|
for disk in disks:
|
|
d = disk.replace('/dev/', '')
|
|
discard_path = '/sys/block/{}/queue/discard_granularity'.format(d)
|
|
if os.path.exists(discard_path):
|
|
with open(discard_path) as f:
|
|
discard = f.read().strip()
|
|
if discard != '0':
|
|
proc = subprocess.Popen(['blkdiscard', disk])
|
|
procs.append(proc)
|
|
for proc in procs:
|
|
proc.wait()
|
|
if raid:
|
|
run('udevadm settle', shell=True, check=True)
|
|
run('mdadm --create --verbose --force --run {raid} --level={level} -c1024 --raid-devices={nr_disk} {disks}'.format(raid=fsdev, level=args.raid_level, nr_disk=len(disks), disks=args.disks.replace(',', ' ')), shell=True, check=True)
|
|
run('udevadm settle', shell=True, check=True)
|
|
|
|
major_minor = os.stat(fsdev).st_rdev
|
|
major, minor = major_minor // 256, major_minor % 256
|
|
sector_size = int(open(f'/sys/dev/block/{major}:{minor}/queue/logical_block_size').read())
|
|
# We want smaller block sizes to allow smaller commitlog writes without
|
|
# stalling. The minimum block size for crc enabled filesystems is 1024,
|
|
# and it also cannot be smaller than the sector size.
|
|
block_size = max(1024, sector_size)
|
|
run(f'mkfs.xfs -b size={block_size} {fsdev} -f -K', shell=True, check=True)
|
|
|
|
if is_debian_variant():
|
|
confpath = '/etc/mdadm/mdadm.conf'
|
|
else:
|
|
confpath = '/etc/mdadm.conf'
|
|
|
|
if raid:
|
|
res = run('mdadm --detail --scan', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
|
|
with open(confpath, 'w') as f:
|
|
f.write(res)
|
|
f.write('\nMAILADDR root')
|
|
|
|
os.makedirs(mount_at, exist_ok=True)
|
|
|
|
uuid = run(f'blkid -s UUID -o value {fsdev}', shell=True, check=True, capture_output=True, encoding='utf-8').stdout.strip()
|
|
after = 'local-fs.target'
|
|
if raid:
|
|
after += f' {md_service}'
|
|
opt_discard = ''
|
|
if args.online_discard:
|
|
opt_discard = ',discard'
|
|
unit_data = f'''
|
|
[Unit]
|
|
Description=Scylla data directory
|
|
Before=scylla-server.service
|
|
After={after}
|
|
Wants={md_service}
|
|
DefaultDependencies=no
|
|
|
|
[Mount]
|
|
What=/dev/disk/by-uuid/{uuid}
|
|
Where={mount_at}
|
|
Type=xfs
|
|
Options=noatime{opt_discard}
|
|
|
|
[Install]
|
|
WantedBy=multi-user.target
|
|
'''[1:-1]
|
|
with open(f'/etc/systemd/system/{mntunit_bn}', 'w') as f:
|
|
f.write(unit_data)
|
|
mounts_conf = '/etc/systemd/system/scylla-server.service.d/mounts.conf'
|
|
if not os.path.exists(mounts_conf):
|
|
os.makedirs('/etc/systemd/system/scylla-server.service.d/', exist_ok=True)
|
|
with open(mounts_conf, 'w') as f:
|
|
f.write(f'[Unit]\nRequiresMountsFor={mount_at}\n')
|
|
else:
|
|
with open(mounts_conf, 'a') as f:
|
|
f.write(f'RequiresMountsFor={mount_at}\n')
|
|
|
|
systemd_unit.reload()
|
|
md_service.start()
|
|
mount = systemd_unit(mntunit_bn)
|
|
mount.start()
|
|
if args.enable_on_nextboot:
|
|
mount.enable()
|
|
uid = pwd.getpwnam('scylla').pw_uid
|
|
gid = grp.getgrnam('scylla').gr_gid
|
|
os.chown(root, uid, gid)
|
|
|
|
for d in ['coredump', 'data', 'commitlog', 'hints', 'view_hints', 'saved_caches']:
|
|
dpath = '{}/{}'.format(root, d)
|
|
os.makedirs(dpath, exist_ok=True)
|
|
os.chown(dpath, uid, gid)
|
|
|
|
if is_debian_variant():
|
|
run('update-initramfs -u', shell=True, check=True)
|