The discussion on the thread says that when we reformat a volume with another filesystem, the kernel and libblkid may skip populating /dev/disk/by-* because they detect two filesystem signatures, since mkfs.xxx did not clear the previous filesystem signature. To avoid this, we need to run wipefs before running mkfs. Note that this runs wipefs twice: once for the target disks and once for the RAID device. wipefs on the RAID device is needed because wipefs on the disks does not clear filesystem signatures on /dev/mdX (we may see a previous filesystem signature on /dev/mdX when we construct a RAID volume multiple times on the same disks). Also dropped the -f option from mkfs.xfs, so that mkfs.xfs verifies wipefs worked as we expected. Fixes #13737 Signed-off-by: Takuya ASADA <syuu@scylladb.com> Closes #13738
227 lines
7.9 KiB
Python
Executable File
227 lines
7.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
#
|
|
# Copyright 2018-present ScyllaDB
|
|
#
|
|
|
|
#
|
|
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
|
|
import os
|
|
import argparse
|
|
import pwd
|
|
import grp
|
|
import sys
|
|
import stat
|
|
import distro
|
|
from pathlib import Path
|
|
from scylla_util import *
|
|
from subprocess import run, SubprocessError
|
|
|
|
def parse_bool_arg(value):
    """Convert a command-line boolean spelling into a real ``bool``.

    Used as the argparse ``type=`` for ``--online-discard`` so that values
    such as ``0`` or ``false`` actually disable the option (a bare non-empty
    string is always truthy).

    Accepts 1/y/yes/t/true/on and 0/n/no/f/false/off, case-insensitively.
    A ``bool`` is returned unchanged.

    Raises:
        argparse.ArgumentTypeError: if the value is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ('1', 'y', 'yes', 't', 'true', 'on'):
        return True
    if text in ('0', 'n', 'no', 'f', 'false', 'off'):
        return False
    raise argparse.ArgumentTypeError(f'invalid boolean value: {value!r}')


def device_numbers(rdev):
    """Split a raw ``st_rdev`` value into its ``(major, minor)`` pair.

    Uses os.major()/os.minor() rather than dividing by 256, because the
    device-number encoding is not a plain 8-bit/8-bit split and large minor
    numbers would otherwise be decoded incorrectly.
    """
    return os.major(rdev), os.minor(rdev)


if __name__ == '__main__':
    # Everything below formats block devices and writes under /etc, so refuse
    # to run as a non-root user.
    if os.getuid() > 0:
        print('Requires root permission.')
        sys.exit(1)
    parser = argparse.ArgumentParser(description='Configure RAID volume for Scylla.')
    parser.add_argument('--disks', required=True,
                        help='specify disks for RAID')
    parser.add_argument('--raiddev',
                        help='MD device name for RAID')
    parser.add_argument('--enable-on-nextboot', '--update-fstab', action='store_true', default=False,
                        help='mount RAID on next boot')
    parser.add_argument('--root', default='/var/lib/scylla',
                        help='specify the root of the tree')
    parser.add_argument('--volume-role', default='all',
                        help='specify how will this device be used (data, commitlog, or all)')
    parser.add_argument('--force-raid', action='store_true', default=False,
                        help='force constructing RAID when only one disk is specified')
    parser.add_argument('--raid-level', default='0',
                        help='specify RAID level')
    # type= makes "--online-discard 0" / "false" really disable discard.
    parser.add_argument('--online-discard', default=True, type=parse_bool_arg,
                        help='Enable XFS online discard (trim SSD cells after file deletion)')

    args = parser.parse_args()

    # The mount point depends on which part of the Scylla tree this volume backs.
    root = args.root.rstrip('/')
    if args.volume_role == 'all':
        mount_at = root
    elif args.volume_role == 'data':
        mount_at = '{}/data'.format(root)
    elif args.volume_role == 'commitlog':
        mount_at = '{}/commitlog'.format(root)
    else:
        print('Invalid role specified ({})'.format(args.volume_role))
        parser.print_help()
        sys.exit(1)

    # Every requested disk must exist, be a block device and be unused.
    # NOTE(review): is_unused_disk presumably comes from scylla_util.
    disks = args.disks.split(',')
    for disk in disks:
        if not os.path.exists(disk):
            print('{} is not found'.format(disk))
            sys.exit(1)
        if not stat.S_ISBLK(os.stat(disk).st_mode):
            print('{} is not block device'.format(disk))
            sys.exit(1)
        if not is_unused_disk(disk):
            print('{} is busy'.format(disk))
            sys.exit(1)

    if len(disks) == 1 and not args.force_raid:
        # A single disk is formatted directly, with no MD layer.
        raid = False
        fsdev = disks[0]
    else:
        raid = True
        if args.raiddev is None:
            raiddevs_to_try = [f'/dev/md{i}' for i in range(10)]
        else:
            raiddevs_to_try = [args.raiddev, ]
        # Pick the first candidate that is free; the for/else exits when
        # every candidate is in use.
        for fsdev in raiddevs_to_try:
            raiddevname = os.path.basename(fsdev)
            array_state = Path(f'/sys/block/{raiddevname}/md/array_state')
            # mdX is not allocated
            if not array_state.exists():
                break
            with array_state.open() as f:
                # allocated, but no devices, not running
                if f.read().strip() == 'clear':
                    break
            print(f'{fsdev} is already using')
        else:
            if args.raiddev is None:
                print("Can't find unused /dev/mdX")
            sys.exit(1)
        print(f'{fsdev} will be used to setup a RAID')

    if os.path.ismount(mount_at):
        print('{} is already mounted'.format(mount_at))
        sys.exit(1)

    # Derive the systemd mount unit name for the mount point and refuse to
    # overwrite an existing unit file.
    mntunit_bn = out('systemd-escape -p --suffix=mount {}'.format(mount_at))
    mntunit = Path('/etc/systemd/system/{}'.format(mntunit_bn))
    if mntunit.exists():
        print('mount unit {} already exists'.format(mntunit))
        sys.exit(1)

    # Install the required tools when they are not on PATH.
    if not shutil.which('mkfs.xfs'):
        pkg_install('xfsprogs')
    if not shutil.which('mdadm'):
        pkg_install('mdadm')
    if args.raid_level != '0':
        # Look up the MD monitoring unit, falling back to the alternative
        # name when the first lookup raises SystemdException.
        try:
            md_service = systemd_unit('mdmonitor.service')
        except SystemdException:
            md_service = systemd_unit('mdadm.service')

    print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type=f'RAID{args.raid_level}' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
    # Discard all disks that advertise discard support, in parallel.
    # The blkdiscard exit status is not checked.
    procs = []
    for disk in disks:
        d = disk.replace('/dev/', '')
        discard_path = '/sys/block/{}/queue/discard_granularity'.format(d)
        if os.path.exists(discard_path):
            with open(discard_path) as f:
                discard = f.read().strip()
            if discard != '0':
                proc = subprocess.Popen(['blkdiscard', disk])
                procs.append(proc)
    for proc in procs:
        proc.wait()
    # Clear old filesystem signatures first: a leftover signature next to the
    # new one can make the kernel/libblkid skip populating /dev/disk/by-*.
    for disk in disks:
        run(['wipefs', '-a', disk], check=True)
    if raid:
        run(['udevadm', 'settle'], check=True)
        run(['mdadm', '--create', '--verbose', '--force', '--run', fsdev,
             f'--level={args.raid_level}', '-c1024',
             f'--raid-devices={len(disks)}', *disks], check=True)
        # Wiping the member disks does not clear signatures visible on the MD
        # device itself, so wipe it as well.
        run(['wipefs', '-a', fsdev], check=True)
        run(['udevadm', 'settle'], check=True)

    major, minor = device_numbers(os.stat(fsdev).st_rdev)
    sector_size = int(Path(f'/sys/dev/block/{major}:{minor}/queue/logical_block_size').read_text())
    # We want smaller block sizes to allow smaller commitlog writes without
    # stalling. The minimum block size for crc enabled filesystems is 1024,
    # and it also cannot be smaller than the sector size.
    block_size = max(1024, sector_size)
    run(['udevadm', 'settle'], check=True)
    # No -f here on purpose: mkfs.xfs then fails if an old signature survived,
    # which verifies that wipefs did its job.
    run(['mkfs.xfs', '-b', f'size={block_size}', fsdev, '-K'], check=True)
    run(['udevadm', 'settle'], check=True)

    if is_debian_variant():
        confpath = '/etc/mdadm/mdadm.conf'
    else:
        confpath = '/etc/mdadm.conf'

    if raid:
        # Write the scanned array definition to mdadm.conf (the file is
        # overwritten).
        res = out('mdadm --detail --scan')
        with open(confpath, 'w') as f:
            f.write(res)
            f.write('\nMAILADDR root')

    os.makedirs(mount_at, exist_ok=True)

    uuid = out(f'blkid -s UUID -o value {fsdev}')
    if not uuid:
        raise Exception(f'Failed to get UUID of {fsdev}')

    uuidpath = f'/dev/disk/by-uuid/{uuid}'

    after = 'local-fs.target'
    wants = ''
    if raid and args.raid_level != '0':
        after += f' {md_service}'
        wants = f'\nWants={md_service}'
    opt_discard = ''
    if args.online_discard:
        opt_discard = ',discard'
    # [1:-1] strips the leading and trailing newline of the literal.
    unit_data = f'''
[Unit]
Description=Scylla data directory
Before=scylla-server.service
After={after}{wants}
DefaultDependencies=no

[Mount]
What={uuidpath}
Where={mount_at}
Type=xfs
Options=noatime{opt_discard}

[Install]
WantedBy=multi-user.target
'''[1:-1]
    with open(f'/etc/systemd/system/{mntunit_bn}', 'w') as f:
        f.write(unit_data)
    # Add a RequiresMountsFor= line for this mount point to the
    # scylla-server drop-in, creating the drop-in when it does not exist.
    mounts_conf = '/etc/systemd/system/scylla-server.service.d/mounts.conf'
    if not os.path.exists(mounts_conf):
        os.makedirs('/etc/systemd/system/scylla-server.service.d/', exist_ok=True)
        with open(mounts_conf, 'w') as f:
            f.write(f'[Unit]\nRequiresMountsFor={mount_at}\n')
    else:
        with open(mounts_conf, 'a') as f:
            f.write(f'RequiresMountsFor={mount_at}\n')

    systemd_unit.reload()
    if args.raid_level != '0':
        md_service.start()
    try:
        mount = systemd_unit(mntunit_bn)
        mount.start()
    except SubprocessError:
        # Report a missing or invalid by-uuid device node before re-raising
        # the original error.
        if not os.path.exists(uuidpath):
            print(f'\nERROR: {uuidpath} is not found\n')
        elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
            print(f'\nERROR: {uuidpath} is not block device\n')
        raise

    if args.enable_on_nextboot:
        mount.enable()
    # Hand the tree over to the scylla user and create the standard
    # subdirectories.
    uid = pwd.getpwnam('scylla').pw_uid
    gid = grp.getgrnam('scylla').gr_gid
    os.chown(root, uid, gid)

    for d in ['coredump', 'data', 'commitlog', 'hints', 'view_hints', 'saved_caches']:
        dpath = '{}/{}'.format(root, d)
        os.makedirs(dpath, exist_ok=True)
        os.chown(dpath, uid, gid)

    if is_debian_variant():
        run(['update-initramfs', '-u'], check=True)
|