scylladb/dist/common/scripts/scylla_raid_setup

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Copyright 2018-present ScyllaDB
#

#
# SPDX-License-Identifier: AGPL-3.0-or-later

import os
import argparse
import pwd
import grp
import sys
import stat
import distro
from pathlib import Path
from scylla_util import *
from subprocess import run, SubprocessError

if __name__ == '__main__':
    if os.getuid() > 0:
        print('Requires root permission.')
        sys.exit(1)
    parser = argparse.ArgumentParser(description='Configure RAID volume for Scylla.')
    parser.add_argument('--disks', required=True,
                        help='specify disks for RAID')
    parser.add_argument('--raiddev',
                        help='MD device name for RAID')
    parser.add_argument('--enable-on-nextboot', '--update-fstab', action='store_true', default=False,
                        help='mount RAID on next boot')
    parser.add_argument('--root', default='/var/lib/scylla',
                        help='specify the root of the tree')
    parser.add_argument('--volume-role', default='all',
                        help='specify how will this device be used (data, commitlog, or all)')
    parser.add_argument('--force-raid', action='store_true', default=False,
                        help='force constructing RAID when only one disk is specified')
    parser.add_argument('--raid-level', default='0',
                        help='specify RAID level')
    parser.add_argument('--online-discard', default=True,
                        help='Enable XFS online discard (trim SSD cells after file deletion)')

    args = parser.parse_args()

    root = args.root.rstrip('/')
    if args.volume_role == 'all':
        mount_at=root
    elif args.volume_role == 'data':
        mount_at='{}/data'.format(root)
    elif args.volume_role == 'commitlog':
        mount_at='{}/commitlog'.format(root)
    else:
        print('Invalid role specified ({})'.format(args.volume_role))
        parser.print_help()
        sys.exit(1)

    disks = args.disks.split(',')
    for disk in disks:
        if not os.path.exists(disk):
            print('{} is not found'.format(disk))
            sys.exit(1)
        if not stat.S_ISBLK(os.stat(disk).st_mode):
            print('{} is not block device'.format(disk))
            sys.exit(1)
        if not is_unused_disk(disk):
            print('{} is busy'.format(disk))
            sys.exit(1)

    if len(disks) == 1 and not args.force_raid:
        raid = False
        fsdev = disks[0]
    else:
        raid = True
        if args.raiddev is None:
            raiddevs_to_try = [f'/dev/md{i}' for i in range(10)]
        else:
            raiddevs_to_try = [args.raiddev, ]
        for fsdev in raiddevs_to_try:
            raiddevname = os.path.basename(fsdev)
            array_state = Path(f'/sys/block/{raiddevname}/md/array_state')
            # mdX is not allocated
            if not array_state.exists():
                break
            with array_state.open() as f:
                # allocated, but no devices, not running
                if f.read().strip() == 'clear':
                    break
            print(f'{fsdev} is already using')
        else:
            if args.raiddev is None:
                print("Can't find unused /dev/mdX")
            sys.exit(1)
        print(f'{fsdev} will be used to setup a RAID')

    if os.path.ismount(mount_at):
        print('{} is already mounted'.format(mount_at))
        sys.exit(1)

    mntunit_bn = out('systemd-escape -p --suffix=mount {}'.format(mount_at))
    mntunit = Path('/etc/systemd/system/{}'.format(mntunit_bn))
    if mntunit.exists():
        print('mount unit {} already exists'.format(mntunit))
        sys.exit(1)

    if not shutil.which('mkfs.xfs'):
        pkg_install('xfsprogs')
    if not shutil.which('mdadm'):
        pkg_install('mdadm')
    if args.raid_level != '0':
        try:
            md_service = systemd_unit('mdmonitor.service')
        except SystemdException:
            md_service = systemd_unit('mdadm.service')

    print('Creating {type} for scylla using {nr_disk} disk(s): {disks}'.format(type=f'RAID{args.raid_level}' if raid else 'XFS volume', nr_disk=len(disks), disks=args.disks))
    procs=[]
    for disk in disks:
        d = disk.replace('/dev/', '')
        discard_path = '/sys/block/{}/queue/discard_granularity'.format(d)
        if os.path.exists(discard_path):
            with open(discard_path) as f:
                discard = f.read().strip()
            if discard != '0':
                proc = subprocess.Popen(['blkdiscard', disk])
                procs.append(proc)
    for proc in procs:
        proc.wait()
    for disk in disks:
        run(f'wipefs -a {disk}', shell=True, check=True)
    if raid:
        run('udevadm settle', shell=True, check=True)
        run('mdadm --create --verbose --force --run {raid} --level={level} -c1024 --raid-devices={nr_disk} {disks}'.format(raid=fsdev, level=args.raid_level, nr_disk=len(disks), disks=args.disks.replace(',', ' ')), shell=True, check=True)
        run(f'wipefs -a {fsdev}', shell=True, check=True)
        run('udevadm settle', shell=True, check=True)

    major_minor = os.stat(fsdev).st_rdev
    major, minor = major_minor // 256, major_minor % 256
    sector_size = int(open(f'/sys/dev/block/{major}:{minor}/queue/logical_block_size').read())
    # We want smaller block sizes to allow smaller commitlog writes without
    # stalling. The minimum block size for crc enabled filesystems is 1024,
    # and it also cannot be smaller than the sector size.
    block_size = max(1024, sector_size)
    run('udevadm settle', shell=True, check=True)
    run(f'mkfs.xfs -b size={block_size} {fsdev} -K', shell=True, check=True)
    run('udevadm settle', shell=True, check=True)

    if is_debian_variant():
        confpath = '/etc/mdadm/mdadm.conf'
    else:
        confpath = '/etc/mdadm.conf'

    if raid:
        res = out('mdadm --detail --scan')
        with open(confpath, 'w') as f:
            f.write(res)
            f.write('\nMAILADDR root')

    os.makedirs(mount_at, exist_ok=True)

    uuid = out(f'blkid -s UUID -o value {fsdev}')
    if not uuid:
        raise Exception(f'Failed to get UUID of {fsdev}')

    uuidpath = f'/dev/disk/by-uuid/{uuid}'

    after = 'local-fs.target'
    wants = ''
    if raid and args.raid_level != '0':
        after += f' {md_service}'
        wants = f'\nWants={md_service}'
    opt_discard = ''
    if args.online_discard:
        opt_discard = ',discard'
    unit_data = f'''
[Unit]
Description=Scylla data directory
Before=scylla-server.service
After={after}{wants}
DefaultDependencies=no

[Mount]
What={uuidpath}
Where={mount_at}
Type=xfs
Options=noatime{opt_discard}

[Install]
WantedBy=multi-user.target
'''[1:-1]
    with open(f'/etc/systemd/system/{mntunit_bn}', 'w') as f:
        f.write(unit_data)
    mounts_conf = '/etc/systemd/system/scylla-server.service.d/mounts.conf'
    if not os.path.exists(mounts_conf):
        os.makedirs('/etc/systemd/system/scylla-server.service.d/', exist_ok=True)
        with open(mounts_conf, 'w') as f:
            f.write(f'[Unit]\nRequiresMountsFor={mount_at}\n')
    else:
        with open(mounts_conf, 'a') as f:
            f.write(f'RequiresMountsFor={mount_at}\n')

    systemd_unit.reload()
    if args.raid_level != '0':
        md_service.start()
    try:
        mount = systemd_unit(mntunit_bn)
        mount.start()
    except SubprocessError as e:
        if not os.path.exists(uuidpath):
            print(f'\nERROR: {uuidpath} is not found\n')
        elif not stat.S_ISBLK(os.stat(uuidpath).st_mode):
            print(f'\nERROR: {uuidpath} is not block device\n')
        raise e

    if args.enable_on_nextboot:
        mount.enable()
    uid = pwd.getpwnam('scylla').pw_uid
    gid = grp.getgrnam('scylla').gr_gid
    os.chown(root, uid, gid)

    for d in ['coredump', 'data', 'commitlog', 'hints', 'view_hints', 'saved_caches']:
        dpath = '{}/{}'.format(root, d)
        os.makedirs(dpath, exist_ok=True)
        os.chown(dpath, uid, gid)

    if is_debian_variant():
        run('update-initramfs -u', shell=True, check=True)