diff --git a/doc/Makefile b/doc/Makefile index d2b95bd76..c1b59b8cc 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -36,7 +36,9 @@ rtf: $(RTFS) $(COMMAND)rtf $(<) clean: + mv "Using the DLM as a Distributed In-Memory Database.pdf" "Using the DLM as a Distributed In-Memory Database.pdf_" rm -f *.txt *.html *.tex *.dvi *.ps *.pdf *.info *.lyx *.rtf + mv "Using the DLM as a Distributed In-Memory Database.pdf_" "Using the DLM as a Distributed In-Memory Database.pdf" extraclean: clean rm -f *.orig *.rej diff --git a/doc/Using the DLM as a Distributed In-Memory Database.pdf b/doc/Using the DLM as a Distributed In-Memory Database.pdf new file mode 100644 index 000000000..19be45865 Binary files /dev/null and b/doc/Using the DLM as a Distributed In-Memory Database.pdf differ diff --git a/scst/README.dlm b/scst/README.dlm new file mode 100644 index 000000000..435e6d8bb --- /dev/null +++ b/scst/README.dlm @@ -0,0 +1,166 @@ +Synchronization of the Persistent Reservation Information via the DLM +===================================================================== + +Introduction +------------ + +In an H.A. setup where multiple servers share data it is required that +the persistent reservation state is kept consistent across the cluster. +One possible approach is to use the DLM to keep the PR state synchronized +across nodes. Since the DLM can associate data with each DLM lock object, +DLM lock objects can be used to store PR data. The data that is associated +with a DLM lock object is called the Lock Value Block or LVB. The code in +scst_dlm.c uses the DLM to keep PR data synchronized across all nodes in +a cluster. + +Software Components +------------------- + +The following software components are needed by the code in scst_dlm.c: +* The DLM kernel driver (dlm.ko). This driver is only built if CONFIG_DLM + has been set. +* The DLM control daemon (dlm_controld.pcmk). This daemon passes cluster + node IDs and IP addresses to the DLM kernel driver via the configfs + interface of the DLM kernel driver. +* Corosync to manage cluster membership of the cluster nodes and to assign + a node ID to each cluster node. +* A facility to start the DLM control daemon, e.g. Pacemaker. + +On most Linux distributions the software packages that contain this software +have the names kernel, dlm, corosync and pacemaker. + +DLM Configuration +----------------- + +The DLM kernel module supports the TCP and SCTP communication protocols. An +advantage of SCTP for H.A. purposes is that it supports multihoming. One of +these protocols can be selected via the -r option of dlm_controld. +That option can be set via the "args" argument of the Pacemaker dlm_controld +resource. For more information, see also: +* The dlm_controld(8) man page. +* In the "Pacemaker 1.1, Clusters from Scratch" guide, the section "Configure + the Cluster for the DLM". +* The dlm_controld resource agent: /usr/lib/ocf/resource.d/pacemaker/controld + +Here is an example of how to set up a cluster with two nodes and how to +configure and start the DLM control daemon: + 1. If a network switch is present between the two nodes, enable IPv4 multicast + on that switch. + 2. Copy /etc/corosync/corosync.conf.example into /etc/corosync/corosync.conf + and edit that file. + 3. If a file /etc/default/corosync exists, enable Corosync in that file. + 4. Start Corosync: + systemctl start corosync || /etc/init.d/corosync start + 5. Check that all configured Corosync rings have two members: + corosync-cfgtool -s && { corosync-cmapctl | grep members; } + 6. 
Start pcsd:
+    systemctl start pcsd || /etc/init.d/pcsd start
+ 7. Set up cluster authentication:
+    pcs cluster auth centos7-vm centos7b-vm
+ 8. Start Pacemaker:
+    systemctl start pacemaker || /etc/init.d/pacemaker start
+ 9. If the cluster has only two nodes, disable the Pacemaker quorum policy and
+    disable STONITH:
+    crm_attribute -t crm_config -n no-quorum-policy -v ignore
+    crm_attribute -t crm_config -n stonith-enabled -v false
+10. Check the cluster status:
+    pcs status
+11. Create a Pacemaker resource for dlm_controld:
+    pcs resource delete dlm
+    pcs resource create dlm ocf:pacemaker:controld \
+        args="-q0 -f0" allow_stonith_disabled=true \
+        op monitor timeout=60 \
+        --clone interleave=true
+12. Check the Pacemaker status:
+    pcs status
+
+Startup and Shutdown
+--------------------
+
+The startup sequence is as follows:
+* Load and configure SCST with cluster_mode = 0 and with all target ports
+  disabled.
+* Enable cluster mode for all SCST devices that can be accessed through more
+  than one cluster node:
+    for x in /sys/kernel/scst_tgt/handlers/*/*/; do
+        echo 1 >$x/cluster_mode &
+    done
+    wait
+* Start Corosync and Pacemaker.
+* Wait until Pacemaker has reached the idle state:
+    pacemaker_dc_status() {
+        local dc
+
+        dc="$(crmadmin -D 2>/dev/null | sed 's/Designated Controller is: //')"
+        [ -n "$dc" ] &&
+            crmadmin -S "$dc" 2>/dev/null |
+            sed 's/^Status of crmd@[^[:blank:]]*:[[:blank:]]\([^[:blank:]]*\).*/\1/'
+    }
+    for ((i=0;i<300;i++)); do
+        [ "$(pacemaker_dc_status)" = "S_IDLE" ] && break
+        sleep 1
+    done
+* Enable SCST target ports.
+* If no DLM resource has been configured in Pacemaker, start dlm_controld.pcmk
+  explicitly.
+
+The proper shutdown order is as follows:
+* Tell SCST to stop accepting SCSI commands and wait until all initiators have
+  logged out:
+    for x in $(find /sys/kernel/scst_tgt/targets/ -name enabled); do
+        echo 0 > $x &
+    done
+    wait
+    while ls -Ad /sys/kernel/scst_tgt/targets/*/*/sessions/* >/dev/null 2>&1; do
+        sleep 1
+    done
+* Tell SCST to release the DLM lockspaces:
+    while grep -q '^1$' /sys/kernel/scst_tgt/devices/*/cluster_mode 2>/dev/null
+    do
+        for x in /sys/kernel/scst_tgt/devices/*/cluster_mode; do
+            { [ -e "$x" ] && echo 0 > "$x"; } &
+        done
+        wait
+        sleep 1
+    done
+* Stop Pacemaker and Corosync.
+* Unload the SCST kernel modules.
+* Unload the DLM kernel driver.
+
+Lockspace names
+---------------
+
+The names of the DLM lockspaces used by SCST follow the pattern
+scst-<t10_dev_id>, where <t10_dev_id> is the T10 device ID of the SCST device
+associated with this lockspace.
+
+Notes
+-----
+
+Since the lockspace name depends on the t10_dev_id, it is not allowed to
+change the t10_dev_id while cluster mode is enabled.
+
+Testing
+-------
+
+Two examples of test suites for the cluster PR support code are:
+* The SCSI conformance tests in the libiscsi project.
+* The Windows Cluster Validation Tests
+  (https://technet.microsoft.com/en-us/library/Cc726064.aspx).
+
+To do
+-----
+
+Ensure that PREEMPT AND ABORT affects all cluster nodes instead of only the
+cluster node that received this command.
+
+See also
+--------
+
+* Bart Van Assche, Using the DLM as a distributed in-memory database, Linux
+  Plumbers North America, Seattle, August 20, 2015
+  (https://linuxplumbersconf.org/2015/ocw//system/presentations/2691/original/Using%20the%20DLM%20as%20a%20Distributed%20In-Memory%20Database.pdf).
+* Andrew Beekhof, Pacemaker Configuration Explained, 2015
+  (http://clusterlabs.org/doc/en-US/Pacemaker/1.1/html/Pacemaker_Explained/).
+* Andrew Beekhof, Clusters from Scratch, 2015
+  (http://clusterlabs.org/doc/en-US/Pacemaker/1.1-pcs/html/Clusters_from_Scratch/index.html).
diff --git a/scst/include/scst.h b/scst/include/scst.h
index 8d1940bd2..feb76eff7 100644
--- a/scst/include/scst.h
+++ b/scst/include/scst.h
@@ -38,6 +38,7 @@
 #include
 #include
 #include
+#include <linux/dlm.h>
 #ifdef CONFIG_SCST_MEASURE_LATENCY
 #include
 #endif
@@ -1991,6 +1992,18 @@ struct scst_cmd_threads {
 int scst_set_thr_cpu_mask(struct scst_cmd_threads *cmd_threads,
			   cpumask_t *cpu_mask);
 
+struct scst_pr_dlm_data;
+
+/*
+ * DLM lock status block with a completion that signals when a synchronous
+ * DLM lock operation has finished.
+ */
+struct scst_lksb {
+	struct dlm_lksb lksb;
+	struct completion compl;
+	struct scst_pr_dlm_data *pr_dlm;
+};
+
 /*
  * Used to execute cmd's in order of arrival, honoring SCSI task attributes
  */
@@ -2545,6 +2558,53 @@ struct scst_dev_registrant {
 	/* 2 auxiliary fields used to rollback changes for errors, etc. */
 	struct list_head aux_list_entry;
 	__be64 rollback_key;
+
+	/* For registrant information managed via the DLM. */
+	int dlm_idx;
+	struct scst_lksb lksb;
+	char lvb[PR_DLM_LVB_LEN];
+};
+
+/**
+ * struct scst_cl_ops - Encapsulation of behavior that depends on cluster mode
+ * @pr_init: Initialize resources needed by one of the functions below.
+ * @pr_cleanup: Free resources allocated by one of the functions below.
+ * @pr_is_set: Whether or not one of the registrants holds a reservation.
+ * @pr_init_reg: Cluster-specific registrant initialization.
+ * @pr_rm_reg: Cluster-specific registrant cleanup.
+ * @pr_write_lock: Lock the PR data structures for write access.
+ * @pr_write_unlock: Unlock the PR data structures for write access.
+ * @reserved: Whether an initiator holds an SPC-2 reservation.
+ * @res_lock: Protect the SPC-2 reservation state against concurrent
+ *	modifications.
+ * @res_unlock: Counterpart of @res_lock.
+ * @is_rsv_holder: Whether session @sess holds an SPC-2 reservation on @dev.
+ * @is_not_rsv_holder: Whether another session than @sess holds an SPC-2
+ *	reservation on @dev.
+ * @reserve: Apply an SPC-2 reservation for session @sess on @dev if
+ *	@sess != NULL or clear that reservation if @sess == NULL.
+ */
+struct scst_cl_ops {
+	int (*pr_init)(struct scst_device *dev, const char *cl_dev_id);
+	void (*pr_cleanup)(struct scst_device *dev);
+	bool (*pr_is_set)(struct scst_device *dev);
+	void (*pr_init_reg)(struct scst_device *dev,
+			    struct scst_dev_registrant *reg);
+	void (*pr_rm_reg)(struct scst_device *dev,
+			  struct scst_dev_registrant *reg);
+	void (*pr_write_lock)(struct scst_device *dev,
+			      struct scst_lksb *pr_lksb);
+	void (*pr_write_unlock)(struct scst_device *dev,
+				struct scst_lksb *pr_lksb);
+
+	bool (*reserved)(struct scst_device *dev);
+	void (*res_lock)(struct scst_device *dev, struct scst_lksb *pr_lksb);
+	void (*res_unlock)(struct scst_device *dev, struct scst_lksb *pr_lksb);
+	bool (*is_rsv_holder)(struct scst_device *dev,
+			      struct scst_session *sess);
+	bool (*is_not_rsv_holder)(struct scst_device *dev,
+				  struct scst_session *sess);
+	void (*reserve)(struct scst_device *dev, struct scst_session *sess);
 };
 
 /*
@@ -2725,6 +2785,9 @@ struct scst_device {
 	/* Set if reserved via the SPC-2 SCSI RESERVE command. */
 	struct scst_session *reserved_by;
 
+	/* Operations that depend on whether or not cluster mode is enabled */
+	const struct scst_cl_ops *cl_ops;
+
 	/**********************************************************************
 	 * Persistent reservation fields.
Protected as follows: * - Reading PR data must be protected via scst_pr_read_lock() / @@ -2745,12 +2808,21 @@ struct scst_device { /* Whether or not pr_file_name has been modified via sysfs. */ unsigned int pr_file_name_is_set:1; + /* + * Whether or not the PR state must be synchronized with other cluster + * nodes. + */ + unsigned int cluster_mode:1; + /* Persistent reservation type */ uint8_t pr_type; /* Persistent reservation scope */ uint8_t pr_scope; + /* Data structures for managing PR data via the DLM */ + struct scst_pr_dlm_data *pr_dlm; + /* Mutex to protect PR operations */ struct mutex dev_pr_mutex; @@ -5405,4 +5477,9 @@ void scst_path_put(struct nameidata *nd); #endif int scst_remove_file(const char *name); +int scst_pr_set_cluster_mode(struct scst_device *dev, bool cluster_mode, + const char *cl_dev_id); +int scst_pr_init_dev(struct scst_device *dev); +void scst_pr_clear_dev(struct scst_device *dev); + #endif /* __SCST_H */ diff --git a/scst/include/scst_const.h b/scst/include/scst_const.h index cdb4a4a6a..39a829b58 100644 --- a/scst/include/scst_const.h +++ b/scst/include/scst_const.h @@ -735,4 +735,8 @@ enum { E_TGT_PRIV_NOT_YET_SET = EBUSY }; +/* Size of the lock value block in the DLM PR lockspace */ +#define PR_DLM_LVB_LEN 256 + + #endif /* __SCST_CONST_H */ diff --git a/scst/src/Makefile b/scst/src/Makefile index 1110bbf93..351bb956e 100644 --- a/scst/src/Makefile +++ b/scst/src/Makefile @@ -48,6 +48,13 @@ scst-y += scst_sysfs.o scst-y += scst_mem.o scst-y += scst_debug.o scst-y += scst_pres.o +scst-y += scst_no_dlm.o +ifdef CONFIG_DLM +scst-y += scst_dlm.o +endif +ifdef CONFIG_DLM_MODULE +scst-y += scst_dlm.o +endif scst-y += scst_tg.o obj-$(CONFIG_SCST) += scst.o dev_handlers/ diff --git a/scst/src/dev_handlers/scst_vdisk.c b/scst/src/dev_handlers/scst_vdisk.c index 1287a3769..6a5181061 100644 --- a/scst/src/dev_handlers/scst_vdisk.c +++ b/scst/src/dev_handlers/scst_vdisk.c @@ -396,6 +396,10 @@ static ssize_t vdisk_sysfs_removable_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); static ssize_t vdev_sysfs_filename_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf); +static ssize_t vdev_sysfs_cluster_mode_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf); +static ssize_t vdev_sysfs_cluster_mode_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count); static ssize_t vdisk_sysfs_resync_size_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count); static ssize_t vdisk_sysfs_sync_store(struct kobject *kobj, @@ -483,6 +487,9 @@ static struct kobj_attribute vdisk_removable_attr = __ATTR(removable, S_IRUGO, vdisk_sysfs_removable_show, NULL); static struct kobj_attribute vdisk_filename_attr = __ATTR(filename, S_IRUGO, vdev_sysfs_filename_show, NULL); +static struct kobj_attribute vdisk_cluster_mode_attr = + __ATTR(cluster_mode, S_IWUSR|S_IRUGO, vdev_sysfs_cluster_mode_show, + vdev_sysfs_cluster_mode_store); static struct kobj_attribute vdisk_resync_size_attr = __ATTR(resync_size, S_IWUSR, NULL, vdisk_sysfs_resync_size_store); static struct kobj_attribute vdisk_sync_attr = @@ -540,6 +547,7 @@ static const struct attribute *vdisk_fileio_attrs[] = { &vdisk_o_direct_attr.attr, &vdisk_removable_attr.attr, &vdisk_filename_attr.attr, + &vdisk_cluster_mode_attr.attr, &vdisk_resync_size_attr.attr, &vdisk_sync_attr.attr, &vdev_t10_vend_id_attr.attr, @@ -567,6 +575,7 @@ static const struct attribute *vdisk_blockio_attrs[] = { &vdisk_removable_attr.attr, 
&vdisk_rotational_attr.attr, &vdisk_filename_attr.attr, + &vdisk_cluster_mode_attr.attr, &vdisk_resync_size_attr.attr, &vdisk_sync_attr.attr, &vdev_t10_vend_id_attr.attr, @@ -680,6 +689,7 @@ static struct scst_dev_type vdisk_file_devtype = { "filename, " "nv_cache, " "o_direct, " + "cluster_mode, " "read_only, " "removable, " "rotational, " @@ -734,6 +744,7 @@ static struct scst_dev_type vdisk_blk_devtype = { "dif_filename, " "filename, " "nv_cache, " + "cluster_mode, " "read_only, " "removable, " "rotational, " @@ -1648,6 +1659,9 @@ next: if (vdev_saved_mode_pages_enabled) vdev_load_mode_pages(virt_dev); + res = scst_pr_set_cluster_mode(dev, dev->cluster_mode, + virt_dev->t10_dev_id); + out: TRACE_EXIT(); return res; @@ -1664,6 +1678,8 @@ static void vdisk_detach(struct scst_device *dev) TRACE_DBG("virt_id %d", dev->virt_id); + scst_pr_set_cluster_mode(dev, false, virt_dev->t10_dev_id); + PRINT_INFO("Detached virtual device %s (\"%s\")", virt_dev->name, vdev_get_filename(virt_dev)); @@ -4437,14 +4453,16 @@ static int vdisk_ctrl_m_pg(unsigned char *p, int pcontrol, */ p[2] |= 7 << 5; /* TST */ #endif - p[2] |= 1 << 2; /* D_SENSE */ - p[2] |= 1 << 3; /* DPICZ */ - p[2] |= 1 << 4; /* TMF_ONLY */ - p[3] |= 0xF << 4; /* QUEUE ALGORITHM MODIFIER */ - p[3] |= 3 << 1; /* QErr */ - p[4] |= 1 << 3; /* SWP */ - p[5] |= 1 << 6; /* TAS */ - p[5] |= 0 << 7; /* ATO */ + if (!virt_dev->dev->cluster_mode) { + p[2] |= 1 << 2; /* D_SENSE */ + p[2] |= 1 << 3; /* DPICZ */ + p[2] |= 1 << 4; /* TMF_ONLY */ + p[3] |= 0xF << 4; /* QUEUE ALGORITHM MODIFIER */ + p[3] |= 3 << 1; /* QErr */ + p[4] |= 1 << 3; /* SWP */ + p[5] |= 1 << 6; /* TAS */ + p[5] |= 0 << 7; /* ATO */ + } break; case 2: /* default */ p[2] |= virt_dev->tst << 5; @@ -4920,6 +4938,13 @@ static enum compl_status_e vdisk_exec_mode_select(struct vdisk_cmd_params *p) TRACE_ENTRY(); virt_dev = cmd->dev->dh_priv; + if (cmd->dev->cluster_mode) { + PRINT_ERROR("MODE SELECT: not supported in cluster mode\n"); + scst_set_cmd_error(cmd, + SCST_LOAD_SENSE(scst_sense_invalid_field_in_cdb)); + goto out; + } + mselect_6 = (MODE_SELECT == cmd->cdb[0]); type = cmd->dev->type; @@ -8810,6 +8835,104 @@ out: return res; } +static ssize_t vdev_sysfs_cluster_mode_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct scst_device *dev = container_of(kobj, struct scst_device, + dev_kobj); + + return sprintf(buf, "%d\n%s", dev->cluster_mode, + dev->cluster_mode ? + SCST_SYSFS_KEY_MARK "\n" : ""); +} + +static int vdev_sysfs_process_cluster_mode_store( + struct scst_sysfs_work_item *work) +{ + struct scst_device *dev = work->dev; + struct scst_vdisk_dev *virt_dev; + long clm; + int res; + + res = scst_suspend_activity(SCST_SUSPEND_TIMEOUT_USER); + if (res) + goto out; + + res = mutex_lock_interruptible(&scst_mutex); + if (res) + goto resume; + + /* + * This is safe since we hold a reference on dev_kobj and since + * scst_assign_dev_handler() waits until all dev_kobj references + * have been dropped before invoking .detach(). 
+ */ + virt_dev = dev->dh_priv; +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 39) + res = kstrtol(work->buf, 0, &clm); +#else + res = strict_strtol(work->buf, 0, &clm); +#endif + if (res) + goto unlock; + res = -EINVAL; + if (clm < 0 || clm > 1) + goto unlock; + if (clm != dev->cluster_mode) { + res = scst_pr_set_cluster_mode(dev, clm, virt_dev->t10_dev_id); + if (res) + goto unlock; + dev->cluster_mode = clm; + } else { + res = 0; + } + +unlock: + mutex_unlock(&scst_mutex); + +resume: + scst_resume_activity(); + +out: + kobject_put(&dev->dev_kobj); + + return res; +} + +static ssize_t vdev_sysfs_cluster_mode_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct scst_device *dev = container_of(kobj, struct scst_device, + dev_kobj); + struct scst_sysfs_work_item *work; + char *arg; + int res; + + TRACE_ENTRY(); + + res = -ENOMEM; + arg = kasprintf(GFP_KERNEL, "%.*s", (int)count, buf); + if (!arg) + goto out; + + res = scst_alloc_sysfs_work(vdev_sysfs_process_cluster_mode_store, + false, &work); + if (res) + goto out; + work->dev = dev; + swap(work->buf, arg); + kobject_get(&dev->dev_kobj); + res = scst_sysfs_queue_wait_work(work); + if (res) + goto out; + res = count; + +out: + kfree(arg); + TRACE_EXIT_RES(res); + return res; +} + static int vdisk_sysfs_process_resync_size_store( struct scst_sysfs_work_item *work) { @@ -9165,6 +9288,10 @@ static ssize_t vdev_sysfs_t10_dev_id_store(struct kobject *kobj, dev = container_of(kobj, struct scst_device, dev_kobj); virt_dev = dev->dh_priv; + res = -EPERM; + if (dev->cluster_mode) + goto out; + write_lock(&vdisk_serial_rwlock); if ((count > sizeof(virt_dev->t10_dev_id)) || @@ -9198,6 +9325,7 @@ static ssize_t vdev_sysfs_t10_dev_id_store(struct kobject *kobj, out_unlock: write_unlock(&vdisk_serial_rwlock); +out: TRACE_EXIT_RES(res); return res; } diff --git a/scst/src/scst_dlm.c b/scst/src/scst_dlm.c new file mode 100644 index 000000000..369c98c24 --- /dev/null +++ b/scst/src/scst_dlm.c @@ -0,0 +1,1458 @@ +/* + * Copyright (c) 2013 - 2014 Fusion-io, Inc. All rights reserved. + * Copyright (C) 2014 - 2015 SanDisk Corporation. + * + * Synchronization of persistent registration data with DLM lock value blocks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2 + * of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#ifdef INSIDE_KERNEL_TREE +#include +#include +#else +#include "scst.h" +#include "scst_const.h" +#endif +#include "scst_priv.h" +#include "scst_pres.h" +#include "scst_dlm.h" + +static void scst_pr_dlm_cleanup(struct scst_device *dev); +static void scst_dlm_pre_bast(void *bastarg, int mode); +static void scst_dlm_post_bast(void *bastarg, int mode); +static void scst_dlm_post_ast(void *astarg); + +static inline void compile_time_size_checks(void) +{ + BUILD_BUG_ON(sizeof(struct pr_lvb) > PR_DLM_LVB_LEN); + BUILD_BUG_ON(sizeof(struct pr_lvb) != 20); + BUILD_BUG_ON(sizeof(struct pr_reg_lvb) > PR_DLM_LVB_LEN); + BUILD_BUG_ON(sizeof(struct pr_reg_lvb) != 240); +} + +static void scst_dlm_ast(void *astarg) +{ + struct scst_lksb *scst_lksb = astarg; + + complete(&scst_lksb->compl); +} + +/** + * scst_dlm_cancel - Synchronously cancel a pending dlm_lock() operation + */ +static int scst_dlm_cancel(dlm_lockspace_t *ls, struct scst_lksb *lksb, + int flags, const char *name) +{ + int res; + + res = dlm_unlock(ls, lksb->lksb.sb_lkid, + DLM_LKF_CANCEL | (flags & DLM_LKF_VALBLK), + &lksb->lksb, lksb); + if (res < 0) + goto out; + res = wait_for_completion_timeout(&lksb->compl, 10 * HZ); + +out: + return res; +} + +/** + * scst_dlm_lock_wait - Wait until a DLM lock has been granted + * @ls: DLM lock space. + * @mode: DLM lock mode. + * @lksb: DLM lock status block. + * @flags: DLM flags. + * @name: DLM lock name. Only required for non-conversion requests. + * @bast: AST to be invoked in case this lock blocks another one. + */ +static int scst_dlm_lock_wait(dlm_lockspace_t *ls, int mode, + struct scst_lksb *lksb, int flags, + const char *name, void (*bast)(void *, int)) +{ + int res; + + init_completion(&lksb->compl); + res = dlm_lock(ls, mode, &lksb->lksb, flags, + (void *)name, name ? strlen(name) : 0, 0, + scst_dlm_ast, lksb, bast); + if (res < 0) + goto out; + res = wait_for_completion_timeout(&lksb->compl, 60 * HZ); + if (res > 0) + res = lksb->lksb.sb_status; + else if (res == 0) + res = -ETIMEDOUT; + if (res < 0) { + int res2 = scst_dlm_cancel(ls, lksb, flags, name); + + WARN(res2 < 0, "canceling lock %s / %08x failed: %d\n", + name ? : "?", lksb->lksb.sb_lkid, res2); + } + +out: + return res; +} + +/** + * scst_dlm_unlock_wait - Discard a DLM lock + */ +static int scst_dlm_unlock_wait(dlm_lockspace_t *ls, struct scst_lksb *lksb) +{ + int res; + + sBUG_ON(!ls); + + init_completion(&lksb->compl); + res = dlm_unlock(ls, lksb->lksb.sb_lkid, 0, &lksb->lksb, lksb); + if (res < 0) + goto out; + res = wait_for_completion_timeout(&lksb->compl, 60 * HZ); + if (res > 0) { + res = lksb->lksb.sb_status; + if (res == -DLM_EUNLOCK || res == -DLM_ECANCEL) + res = 0; + } else if (res == 0) { + res = -ETIMEDOUT; + } + +out: + return res; +} + +/* Number of persistent reservation registrants. */ +static uint32_t scst_pr_num_regs(struct scst_device *dev) +{ + struct scst_dev_registrant *reg; + uint32_t num_regs = 0; + + lockdep_assert_pr_read_lock_held(dev); + + list_for_each_entry(reg, &dev->dev_registrants_list, + dev_registrants_list_entry) + num_regs++; + + return num_regs; +} + +/* DLM-specific registrant initialization. 
 */
+static void scst_dlm_pr_init_reg(struct scst_device *dev,
+				 struct scst_dev_registrant *reg)
+{
+	reg->lksb.lksb.sb_lvbptr = (void *)reg->lvb;
+	reg->lksb.lksb.sb_lkid = 0;
+	reg->dlm_idx = -1;
+}
+
+static void scst_dlm_pr_rm_reg_ls(dlm_lockspace_t *ls,
+				  struct scst_dev_registrant *reg)
+{
+	int res;
+
+	if (!reg->lksb.lksb.sb_lkid)
+		return;
+	res = scst_dlm_unlock_wait(ls, &reg->lksb);
+	WARN(res < 0, "scst_dlm_unlock_wait(%08x) failed (%d)",
+	     reg->lksb.lksb.sb_lkid, res);
+	reg->lksb.lksb.sb_lkid = 0;
+	reg->dlm_idx = -1;
+}
+
+/* DLM-specific registrant cleanup. */
+static void scst_dlm_pr_rm_reg(struct scst_device *dev,
+			       struct scst_dev_registrant *reg)
+{
+	lockdep_assert_pr_write_lock_held(dev);
+	scst_dlm_pr_rm_reg_ls(dev->pr_dlm->ls, reg);
+}
+
+/* Copy SPC-2 reservation state from the DLM LVB into @dev. */
+static bool scst_copy_res_from_dlm(struct scst_device *dev, struct pr_lvb *lvb)
+{
+	struct scst_pr_dlm_data *const pr_dlm = dev->pr_dlm;
+	struct scst_session *dropped_res = NULL;
+	bool modified_lvb = false;
+
+	spin_lock_bh(&dev->dev_lock);
+	pr_dlm->reserved_by_nodeid = be32_to_cpu(lvb->reserved_by_nodeid);
+	if (dev->reserved_by &&
+	    pr_dlm->reserved_by_nodeid != pr_dlm->local_nodeid) {
+		PRINT_WARNING("%s: dropping SPC-2 reservation for %s (due to"
+			      " split-brain?) because node %d holds a"
+			      " reservation", dev->virt_name,
+			      dev->reserved_by->initiator_name,
+			      pr_dlm->reserved_by_nodeid);
+		swap(dev->reserved_by, dropped_res);
+	}
+	if (!dev->reserved_by &&
+	    pr_dlm->reserved_by_nodeid == pr_dlm->local_nodeid) {
+		PRINT_WARNING("%s: dropping SPC-2 reservation (due to restart"
+			      " or split-brain?) and triggering LVB update"
+			      " because of inconsistency (holder %d / not"
+			      " reserved)",
+			      dev->virt_name, pr_dlm->reserved_by_nodeid);
+		pr_dlm->reserved_by_nodeid = 0;
+		lvb->reserved_by_nodeid = 0;
+		modified_lvb = true;
+	}
+	if (dev->reserved_by)
+		EXTRACHECKS_BUG_ON(pr_dlm->reserved_by_nodeid !=
+				   pr_dlm->local_nodeid);
+	else
+		EXTRACHECKS_BUG_ON(pr_dlm->reserved_by_nodeid ==
+				   pr_dlm->local_nodeid);
+	if (dropped_res)
+		scst_sess_get(dropped_res);
+	spin_unlock_bh(&dev->dev_lock);
+
+	if (dropped_res) {
+		/*
+		 * To do: something like
+		 * scst_do_nexus_loss_sess(dropped_res, true);
+		 */
+		scst_sess_put(dropped_res);
+	}
+
+	return modified_lvb;
+}
+
+/*
+ * Update local PR and registrant information from the content of the DLM LVBs.
+ * Caller must hold PR_DATA_LOCK in PW mode.
+ *
+ * Returns -EINVAL if and only if an invalid lock value block has been
+ * encountered.
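+ * Other error returns: -ENOMEM if allocating the temporary lksb array fails,
+ * -EFAULT if acquiring a per-registrant lock fails and -EPROTONOSUPPORT if a
+ * registrant LVB has an unexpected version number.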
+ */ +static int scst_copy_from_dlm(struct scst_device *dev, dlm_lockspace_t *ls, + bool *modified_lvb) +{ + struct scst_pr_dlm_data *const pr_dlm = dev->pr_dlm; + struct pr_lvb *lvb = (void *)pr_dlm->lvb; + struct scst_lksb *reg_lksb = NULL; + struct scst_dev_registrant *reg, *tmp_reg; + int i, res = -ENOMEM; + uint32_t nr_registrants; + void *reg_lvb_content = NULL; + + lockdep_assert_held(&pr_dlm->ls_mutex); + + nr_registrants = be32_to_cpu(lvb->nr_registrants); + if (nr_registrants) { + reg_lksb = vzalloc((sizeof(*reg_lksb) + PR_DLM_LVB_LEN) * + nr_registrants); + if (!reg_lksb) { + PRINT_ERROR("%s: failed to allocate %d * %zd bytes of" + " memory", __func__, nr_registrants, + sizeof(*reg_lksb) + PR_DLM_LVB_LEN); + goto out; + } + reg_lvb_content = (void *)reg_lksb + + nr_registrants * sizeof(*reg_lksb); + } + + for (i = 0; i < nr_registrants; i++) { + char reg_name[32]; + struct pr_reg_lvb *reg_lvb; + + snprintf(reg_name, sizeof(reg_name), PR_REG_LOCK, i); + reg_lvb = reg_lvb_content + i * PR_DLM_LVB_LEN; + reg_lksb[i].lksb.sb_lvbptr = (void *)reg_lvb; + res = scst_dlm_lock_wait(ls, DLM_LOCK_PW, ®_lksb[i], + DLM_LKF_VALBLK, reg_name, NULL); + if (res < 0) { + res = -EFAULT; + PRINT_ERROR("locking %s.%s failed", dev->virt_name, + reg_name); + goto cancel; + } else if (reg_lksb[i].lksb.sb_flags & DLM_SBF_VALNOTVALID) { + res = -EINVAL; + PRINT_WARNING("%s.%s has an invalid lock value block", + dev->virt_name, reg_name); + goto cancel; + } else if (reg_lvb->version != 1) { + res = -EPROTONOSUPPORT; + PRINT_ERROR("%s.%s.version = %d instead of 1", + dev->virt_name, reg_name, + reg_lvb->version); + goto cancel; + } + } + + *modified_lvb = scst_copy_res_from_dlm(dev, lvb); + + scst_pr_write_lock(dev); + + dev->pr_aptpl = lvb->pr_aptpl; + dev->pr_generation = be32_to_cpu(lvb->pr_generation); + dev->pr_is_set = lvb->pr_is_set; + dev->pr_type = lvb->pr_type; + dev->pr_scope = lvb->pr_scope; + dev->pr_holder = NULL; + + list_for_each_entry(reg, &dev->dev_registrants_list, + dev_registrants_list_entry) + scst_dlm_pr_rm_reg_ls(ls, reg); + + for (i = 0; i < nr_registrants; i++) { + struct pr_reg_lvb *reg_lvb; + uint16_t rel_tgt_id; + + reg_lvb = (struct pr_reg_lvb *)reg_lksb[i].lksb.sb_lvbptr; + rel_tgt_id = be16_to_cpu(reg_lvb->rel_tgt_id); +#if 0 + PRINT_INFO("Transport ID in %s." PR_REG_LOCK " (len %d):", + dev->virt_name, i, scst_tid_size(reg_lvb->tid)); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 16, 1, + reg_lvb->tid, scst_tid_size(reg_lvb->tid), 1); +#endif + reg = scst_pr_find_reg(dev, reg_lvb->tid, rel_tgt_id); + if (reg && reg->key != reg_lvb->key) { + scst_pr_remove_registrant(dev, reg); + reg = NULL; + } + if (!reg) + reg = scst_pr_add_registrant(dev, reg_lvb->tid, + rel_tgt_id, reg_lvb->key, + false); + if (reg) { + scst_dlm_pr_rm_reg_ls(ls, reg); + reg->lksb.lksb.sb_lkid = reg_lksb[i].lksb.sb_lkid; + reg->dlm_idx = i; + memcpy(reg->lvb, reg_lvb_content, sizeof(reg->lvb)); + if (reg_lvb->is_holder) { + if (dev->pr_is_set) + scst_pr_clear_holder(dev); + scst_pr_set_holder(dev, reg, lvb->pr_scope, + lvb->pr_type); + } + } else { + PRINT_ERROR("pr_add_registrant %s." 
+				    PR_REG_LOCK " failed\n",
+				    dev->virt_name, i);
+			scst_dlm_unlock_wait(ls, &reg_lksb[i]);
+			continue;
+		}
+		scst_dlm_lock_wait(ls, DLM_LOCK_CR, &reg->lksb,
+				   DLM_LKF_CONVERT | DLM_LKF_VALBLK, NULL,
+				   NULL);
+	}
+
+	/* Remove all registrants not found in any DLM LVB */
+	list_for_each_entry_safe(reg, tmp_reg, &dev->dev_registrants_list,
+				 dev_registrants_list_entry)
+		if (reg->lksb.lksb.sb_lkid == 0)
+			scst_pr_remove_registrant(dev, reg);
+
+	scst_pr_write_unlock(dev);
+
+	res = 0;
+
+out:
+	vfree(reg_lksb);
+	return res;
+
+cancel:
+	for (i = 0; i < nr_registrants; i++)
+		if (reg_lksb[i].lksb.sb_lkid)
+			scst_dlm_unlock_wait(ls, &reg_lksb[i]);
+
+	goto out;
+}
+
+static struct scst_dev_registrant*
+scst_get_reg_by_dlm_idx(struct scst_device *dev, int i)
+{
+	struct scst_dev_registrant *reg;
+
+	lockdep_assert_pr_read_lock_held(dev);
+
+	list_for_each_entry(reg, &dev->dev_registrants_list,
+			    dev_registrants_list_entry)
+		if (reg->dlm_idx == i)
+			return reg;
+
+	return NULL;
+}
+
+static int scst_get_available_dlm_idx(struct scst_device *dev)
+{
+	int i = 0;
+
+	lockdep_assert_pr_read_lock_held(dev);
+
+	while (scst_get_reg_by_dlm_idx(dev, i))
+		i++;
+
+	return i;
+}
+
+/* Copy SPC-2 reservation state for @dev into the DLM LVB @lvb. */
+static void scst_copy_res_to_dlm(struct scst_device *dev, struct pr_lvb *lvb)
+{
+	struct scst_pr_dlm_data *const pr_dlm = dev->pr_dlm;
+
+	spin_lock_bh(&dev->dev_lock);
+	lvb->reserved_by_nodeid = cpu_to_be32(pr_dlm->reserved_by_nodeid);
+	spin_unlock_bh(&dev->dev_lock);
+}
+
+/*
+ * Update PR and registrant information in the DLM LVBs. Caller must hold
+ * PR_DATA_LOCK in PW mode.
+ */
+static void scst_copy_to_dlm(struct scst_device *dev, dlm_lockspace_t *ls)
+{
+	struct scst_pr_dlm_data *const pr_dlm = dev->pr_dlm;
+	struct pr_lvb *lvb = (void *)pr_dlm->lvb;
+	struct pr_reg_lvb *reg_lvb;
+	struct scst_dev_registrant *reg;
+	int i, tid_size;
+	char reg_name[32];
+	uint32_t nr_registrants;
+
+	lockdep_assert_held(&pr_dlm->ls_mutex);
+
+	scst_copy_res_to_dlm(dev, lvb);
+
+	scst_pr_write_lock(dev);
+
+	nr_registrants = scst_pr_num_regs(dev);
+	lvb->version = 1;
+	lvb->pr_is_set = dev->pr_is_set;
+	lvb->pr_type = dev->pr_type;
+	lvb->pr_scope = dev->pr_scope;
+	lvb->pr_aptpl = dev->pr_aptpl;
+	lvb->nr_registrants = cpu_to_be32(nr_registrants);
+	lvb->pr_generation = cpu_to_be32(dev->pr_generation);
+
+	list_for_each_entry(reg, &dev->dev_registrants_list,
+			    dev_registrants_list_entry) {
+		if (reg->dlm_idx >= nr_registrants)
+			scst_dlm_pr_rm_reg_ls(ls, reg);
+		if (reg->dlm_idx < 0) {
+			i = scst_get_available_dlm_idx(dev);
+			snprintf(reg_name, sizeof(reg_name), PR_REG_LOCK, i);
+			if (scst_dlm_lock_wait(ls, DLM_LOCK_NL,
+					       &reg->lksb, 0, reg_name, NULL)
+			    >= 0)
+				reg->dlm_idx = i;
+		}
+	}
+
+	list_for_each_entry(reg, &dev->dev_registrants_list,
+			    dev_registrants_list_entry) {
+		if (WARN_ON(!reg->lksb.lksb.sb_lkid))
+			continue;
+		snprintf(reg_name, sizeof(reg_name), PR_REG_LOCK, reg->dlm_idx);
+		if (scst_dlm_lock_wait(ls, DLM_LOCK_PW, &reg->lksb,
+				       DLM_LKF_VALBLK | DLM_LKF_CONVERT,
+				       reg_name, NULL) >= 0) {
+			reg_lvb = (void *)reg->lksb.lksb.sb_lvbptr;
+			memset(reg->lvb, 0, sizeof(reg->lvb));
+			reg_lvb->key = reg->key;
+			reg_lvb->rel_tgt_id = cpu_to_be16(reg->rel_tgt_id);
+			reg_lvb->version = 1;
+			reg_lvb->is_holder = dev->pr_holder == reg;
+			tid_size = scst_tid_size(reg->transport_id);
+#if 0
+			PRINT_INFO("Copying transport ID into %s."
+				   PR_REG_LOCK
+				   " (len %d)", dev->virt_name, reg->dlm_idx,
+				   tid_size);
+			print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 16,
+				       1, reg->transport_id, tid_size, 1);
+#endif
+			if (WARN(tid_size > sizeof(reg_lvb->tid),
+				 "tid_size %d > %zd\n", tid_size,
+				 sizeof(reg_lvb->tid)))
+				tid_size = sizeof(reg_lvb->tid);
+			memcpy(reg_lvb->tid, reg->transport_id, tid_size);
+			scst_dlm_lock_wait(ls, DLM_LOCK_CR, &reg->lksb,
+					   DLM_LKF_CONVERT | DLM_LKF_VALBLK,
+					   reg_name, NULL);
+		} else {
+			PRINT_ERROR("Failed to lock %s.%s", dev->virt_name,
+				    reg_name);
+		}
+	}
+
+	scst_pr_write_unlock(dev);
+}
+
+/*
+ * Read the contents of a file, copy it into a buffer and terminate the buffer
+ * with '\0'.
+ */
+static int scst_read_file(const char *path, char *buf, int buf_len)
+{
+	struct file *f;
+	loff_t pos;
+	int ret;
+
+	f = filp_open(path, 0, 0400);
+	if (IS_ERR(f)) {
+		ret = PTR_ERR(f);
+		goto out;
+	}
+	pos = 0;
+	ret = vfs_read(f, (char __force __user *)buf, buf_len, &pos);
+	if (ret >= 0)
+		buf[min(ret, buf_len - 1)] = '\0';
+	filp_close(f, NULL);
+out:
+	return ret;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 11, 0)
+struct scst_dlm_readdir_context {
+	struct dir_context ctx;
+	char *entries;
+};
+#endif
+
+/* Append the name of each directory entry to the buffer @arg points to. */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)
+static int scst_dlm_filldir(void *arg, const char *name_arg, int name_len,
+			    loff_t curr_pos, u64 inode, unsigned dtype)
+#else
+static int scst_dlm_filldir(struct dir_context *arg, const char *name_arg,
+			    int name_len, loff_t curr_pos, u64 inode,
+			    unsigned dtype)
+#endif
+{
+	char *p, *q, name[64];
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0)
+	char **entries = arg;
+#else
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0)
+	struct scst_dlm_readdir_context *ctx =
+		container_of((struct dir_context *)arg, typeof(*ctx), ctx);
+#else
+	struct scst_dlm_readdir_context *ctx =
+		container_of(arg, typeof(*ctx), ctx);
+#endif
+	char **entries = &ctx->entries;
+#endif
+	int i;
+
+	snprintf(name, sizeof(name), "%.*s", name_len, name_arg);
+	if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0 || !*entries)
+		goto out;
+	for (p = *entries; *p; p += strlen(p) + 1)
+		;
+	i = p - *entries;
+	q = *entries;
+	*entries = krealloc(q, i + strlen(name) + 2, GFP_KERNEL);
+	if (!*entries) {
+		kfree(q);
+		goto out;
+	}
+	strcpy(*entries + i, name);
+	i += strlen(name);
+	(*entries)[i + 1] = '\0';
+
+out:
+	return *entries ?
0 : -ENOMEM; +} + +/** + * scst_dlm_update_nodeids - Update the Corosync node ID array pr_dlm->nodeid[] + */ +static int scst_dlm_update_nodeids(struct scst_pr_dlm_data *pr_dlm) +{ + static const char comms_dir[] = "/sys/kernel/config/dlm/cluster/comms"; + struct file *comms; + char *p, *entries = kzalloc(1, GFP_KERNEL); + uint32_t nodeid, *new; + int i, ret, num_nodes; + char path[256], buf[64]; + + lockdep_assert_held(&pr_dlm->ls_mutex); + + num_nodes = 0; + + comms = filp_open(comms_dir, 0, 0400); + if (IS_ERR(comms)) { + ret = PTR_ERR(comms); + goto out; + } +#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 11, 0) + ret = vfs_readdir(comms, scst_dlm_filldir, &entries); +#else + { + struct scst_dlm_readdir_context ctx = { + .ctx = { + .actor = scst_dlm_filldir, + }, + .entries = entries, + }; + ret = iterate_dir(comms, &ctx.ctx); + entries = ctx.entries; + } +#endif + filp_close(comms, NULL); + ret = -ENOMEM; + if (!entries) + goto out; + for (p = entries; *p; p += strlen(p) + 1) + num_nodes++; + new = krealloc(pr_dlm->nodeid, sizeof(*pr_dlm->nodeid) * num_nodes, + GFP_KERNEL); + if (!new) + goto out; + pr_dlm->nodeid = new; + pr_dlm->participants = num_nodes; + for (i = 0, p = entries; *p; i++, p += strlen(p) + 1) { + nodeid = simple_strtoul(p, NULL, 0); + snprintf(path, sizeof(path), "%s/%s/local", comms_dir, p); + if (scst_read_file(path, buf, sizeof(buf)) >= 0 && + strcmp(buf, "1\n") == 0) + pr_dlm->local_nodeid = nodeid; + pr_dlm->nodeid[i] = nodeid; + } + ret = 0; + +out: + kfree(entries); + return ret; +} + +/* + * Toggle all non-local DLM locks with name format @fmt from NL to PR and back + * to NL. + */ +static void scst_pr_toggle_lock(struct scst_pr_dlm_data *pr_dlm, + dlm_lockspace_t *ls, const char *fmt) +{ + struct scst_lksb lksb; + int i, res; + char lock_name[32]; + + memset(&lksb, 0, sizeof(lksb)); + for (i = 0; i < pr_dlm->participants; i++) { + if (pr_dlm->nodeid[i] == pr_dlm->local_nodeid) + continue; + snprintf(lock_name, sizeof(lock_name), fmt, pr_dlm->nodeid[i]); + lksb.lksb.sb_lkid = 0; + res = scst_dlm_lock_wait(ls, DLM_LOCK_PR, &lksb, 0, + lock_name, NULL); + if (res < 0) + PRINT_WARNING("Locking %s.%s failed (%d)", + pr_dlm->dev->virt_name, lock_name, res); + if (!lksb.lksb.sb_lkid) + continue; + scst_dlm_lock_wait(ls, DLM_LOCK_NL, &lksb, + DLM_LKF_CONVERT, lock_name, NULL); + scst_dlm_unlock_wait(ls, &lksb); + } +} + +/* Remove a lock from the local DLM lockspace instance. 
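+ * The lock is first converted down to NL mode and then unlocked. sb_lkid is
+ * zeroed afterwards so that subsequent calls become no-ops.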
*/ +static void scst_dlm_remove_lock(dlm_lockspace_t *ls, struct scst_lksb *lksb, + const char *name) +{ + if (!lksb->lksb.sb_lkid) + return; + scst_dlm_lock_wait(ls, DLM_LOCK_NL, lksb, DLM_LKF_CONVERT, name, + NULL); + scst_dlm_unlock_wait(ls, lksb); + lksb->lksb.sb_lkid = 0; +} + +static void scst_dlm_remove_locks(struct scst_pr_dlm_data *pr_dlm, + dlm_lockspace_t *ls) +{ + struct scst_device *dev = pr_dlm->dev; + struct scst_dev_registrant *reg; + + lockdep_assert_held(&pr_dlm->ls_mutex); + + scst_pr_write_lock(dev); + list_for_each_entry(reg, &dev->dev_registrants_list, + dev_registrants_list_entry) + scst_dlm_pr_rm_reg_ls(ls, reg); + scst_pr_write_unlock(dev); + + scst_dlm_remove_lock(ls, &pr_dlm->pre_join_lksb, NULL); + scst_dlm_remove_lock(ls, &pr_dlm->post_join_lksb, NULL); + scst_dlm_remove_lock(ls, &pr_dlm->pre_upd_lksb, NULL); + scst_dlm_remove_lock(ls, &pr_dlm->post_upd_lksb, NULL); + scst_dlm_remove_lock(ls, &pr_dlm->data_lksb, PR_DATA_LOCK); +} + +/* + * If two or more nodes are present in the cluster, tell each other node to + * update the local state information from the DLM lock value blocks. The + * caller must hold PR_LOCK in EX mode. + */ +static void scst_trigger_reread_lvb(struct scst_pr_dlm_data *const pr_dlm, + dlm_lockspace_t *ls) +{ + scst_pr_toggle_lock(pr_dlm, ls, PR_POST_UPDATE_LOCK); + scst_pr_toggle_lock(pr_dlm, ls, PR_PRE_UPDATE_LOCK); + scst_pr_toggle_lock(pr_dlm, ls, PR_POST_UPDATE_LOCK); +} + +/* + * If two or more nodes are present in the cluster, tell each other node to + * refresh the DLM lock value blocks. The caller must hold PR_LOCK in EX mode. + */ +static void scst_trigger_lvb_update(struct scst_pr_dlm_data *const pr_dlm, + dlm_lockspace_t *ls) +{ + PRINT_INFO("%s: about to trigger an LVB update", + pr_dlm->dev->virt_name); + scst_pr_toggle_lock(pr_dlm, ls, PR_POST_JOIN_LOCK); + scst_pr_toggle_lock(pr_dlm, ls, PR_PRE_JOIN_LOCK); + scst_pr_toggle_lock(pr_dlm, ls, PR_POST_JOIN_LOCK); + PRINT_INFO("%s: finished triggering an LVB update", + pr_dlm->dev->virt_name); +} + +static void dump_lockspace(const char *cl_dev_id) +{ + char *argv0 = kstrdup("/bin/bash", GFP_KERNEL); + char *argv1 = kstrdup("-c", GFP_KERNEL); + char *argv2 = kasprintf(GFP_KERNEL, + "{ echo lockspace-dump-start;" + " grep -aH '' /sys/kernel/debug/dlm/%s%s*;" + " echo lockspace-dump-end; } 2>&1 |" + " while read line; do logger \"$line\"; done", + SCST_DLM_LOCKSPACE_PFX, cl_dev_id); + char *argv[] = { argv0, argv1, argv2, NULL }; + char *envp[] = { + kstrdup("PATH=/usr/bin:/bin:/usr/sbin:/sbin", GFP_KERNEL), + NULL + }; + + + if (!argv[0] || !argv[1] || !argv[2] || !envp[0]) { + PRINT_ERROR("%s: out of memory", __func__); + goto out; + } + + PRINT_INFO("Invoking %s", argv2); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 10, 0) + call_usermodehelper(argv0, argv, envp, UMH_WAIT_PROC); +#else + call_usermodehelper_fns(argv0, argv, envp, UMH_WAIT_PROC, NULL, NULL, + NULL); +#endif + +out: + kfree(envp[0]); + kfree(argv[2]); + kfree(argv[1]); + kfree(argv[0]); +} + +static void release_lockspace(dlm_lockspace_t *ls, const char *cl_dev_id) +{ + int res; + + res = dlm_release_lockspace(ls, 1); + if (res) { + PRINT_ERROR("releasing lockspace for %s failed: %d", + cl_dev_id, res); + dump_lockspace(cl_dev_id); + } + if (res == -EBUSY) { + /* + * Releasing a lockspace fails if one or more local instances + * of DLM locks still exist in the lockspace. If that + * happens try to release the lockspace forcibly. 
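+		 * A force value of 2 asks the DLM to release the lockspace
+		 * even if locks still exist in it.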
+ */ + res = dlm_release_lockspace(ls, 2); + if (res) + PRINT_ERROR("forcibly releasing lockspace for %s" + " failed: %d", cl_dev_id, res); + } + if (res == 0) + PRINT_INFO("released lockspace for %s", cl_dev_id); +} + +/* Initialize DLM lockspace. */ +static dlm_lockspace_t *get_lockspace(struct scst_device *dev) +{ + struct scst_pr_dlm_data *const pr_dlm = dev->pr_dlm; + dlm_lockspace_t *ls; + struct scst_lksb pr_lksb; + struct pr_lvb *lvb = (void *)pr_dlm->lvb; + char lsp_name[32], lock_name[32]; + int res; + bool modified_lvb = false; + + if (pr_dlm->ls || !pr_dlm->cl_dev_id || in_interrupt() || + time_is_after_jiffies(pr_dlm->latest_lscr_attempt + 1 * HZ)) + goto out; + + mutex_lock(&pr_dlm->ls_cr_mutex); + if (pr_dlm->ls) + goto out_unlock_ls_cr; + + pr_dlm->latest_lscr_attempt = jiffies; + + mutex_lock(&pr_dlm->ls_mutex); + + res = scst_dlm_update_nodeids(pr_dlm); + if (res < 0) { + PRINT_ERROR("scst_dlm_update_nodeids(%s) failed: %d", + dev->virt_name, res); + goto out_unlock_ls; + } + if (pr_dlm->participants == 0) + goto out_unlock_ls; + + snprintf(lsp_name, sizeof(lsp_name), "%s%s", SCST_DLM_LOCKSPACE_PFX, + pr_dlm->cl_dev_id); + res = scst_dlm_new_lockspace(lsp_name, strlen(lsp_name), &ls, + DLM_LSFL_NEWEXCL | DLM_LSFL_FS, + PR_DLM_LVB_LEN); + if (res) { + PRINT_ERROR("Creating DLM lockspace %s failed: %d", lsp_name, + res); + goto out_unlock_ls; + } + + PRINT_INFO("Created DLM lockspace %s for %s", lsp_name, dev->virt_name); + + memset(&pr_lksb, 0, sizeof(pr_lksb)); + res = scst_dlm_lock_wait(ls, DLM_LOCK_EX, &pr_lksb, 0, PR_LOCK, + NULL); + if (res < 0) + goto unlock_dlm_pr; + + snprintf(lock_name, sizeof(lock_name), PR_POST_JOIN_LOCK, + pr_dlm->local_nodeid); + pr_dlm->post_join_lksb.pr_dlm = pr_dlm; + res = scst_dlm_lock_wait(ls, DLM_LOCK_NL, + &pr_dlm->post_join_lksb, 0, lock_name, + scst_dlm_post_bast); + if (res < 0) + goto release_lockspace; + + snprintf(lock_name, sizeof(lock_name), PR_PRE_JOIN_LOCK, + pr_dlm->local_nodeid); + pr_dlm->pre_join_lksb.pr_dlm = pr_dlm; + res = scst_dlm_lock_wait(ls, DLM_LOCK_EX, + &pr_dlm->pre_join_lksb, 0, lock_name, + scst_dlm_pre_bast); + if (res < 0) + goto release_lockspace; + + res = scst_dlm_lock_wait(ls, DLM_LOCK_PW, &pr_dlm->data_lksb, + DLM_LKF_VALBLK, PR_DATA_LOCK, NULL); + if (res < 0) + goto release_lockspace; + + if (pr_dlm->data_lksb.lksb.sb_status & DLM_SBF_VALNOTVALID) { + PRINT_ERROR("%s.%s lock value block not valid", dev->virt_name, + PR_DATA_LOCK); + memset(pr_dlm->lvb, 0, sizeof(pr_dlm->lvb)); + } + + snprintf(lock_name, sizeof(lock_name), PR_POST_UPDATE_LOCK, + pr_dlm->local_nodeid); + pr_dlm->post_upd_lksb.pr_dlm = pr_dlm; + res = scst_dlm_lock_wait(ls, DLM_LOCK_NL, + &pr_dlm->post_upd_lksb, 0, lock_name, + scst_dlm_post_bast); + if (res < 0) + goto release_lockspace; + + snprintf(lock_name, sizeof(lock_name), PR_PRE_UPDATE_LOCK, + pr_dlm->local_nodeid); + pr_dlm->pre_upd_lksb.pr_dlm = pr_dlm; + res = scst_dlm_lock_wait(ls, DLM_LOCK_EX, &pr_dlm->pre_upd_lksb, + 0, lock_name, scst_dlm_pre_bast); + if (res < 0) + goto release_lockspace; + + switch (lvb->version) { + case 0: + scst_copy_to_dlm(dev, ls); + break; + case 1: + res = scst_copy_from_dlm(dev, ls, &modified_lvb); + break; + default: + PRINT_ERROR("%s: Wrong PR LVB version %d", dev->virt_name, + lvb->version); + goto release_lockspace; + } + + scst_dlm_lock_wait(ls, DLM_LOCK_CR, &pr_dlm->data_lksb, + DLM_LKF_CONVERT | DLM_LKF_VALBLK, PR_DATA_LOCK, + NULL); + + if (res == -EINVAL) + scst_trigger_lvb_update(pr_dlm, ls); + else if (modified_lvb) + 
scst_trigger_reread_lvb(pr_dlm, ls); + + scst_dlm_unlock_wait(ls, &pr_lksb); + + /* + * Only store the lockspace pointer in pr_dlm->ls after the lockspace + * has been fully initialized. Storing it earlier would create a risk + * that a concurrent get_lockspace() call returns a pointer to the + * lockspace that is under creation. + */ + pr_dlm->ls = ls; + +out_unlock_ls: + mutex_unlock(&pr_dlm->ls_mutex); + +out_unlock_ls_cr: + mutex_unlock(&pr_dlm->ls_cr_mutex); + +out: + return pr_dlm->ls; + +release_lockspace: + scst_dlm_remove_locks(pr_dlm, ls); +unlock_dlm_pr: + scst_dlm_remove_lock(ls, &pr_lksb, PR_LOCK); + mutex_unlock(&pr_dlm->ls_mutex); + + cancel_work_sync(&pr_dlm->copy_from_dlm_work); + cancel_work_sync(&pr_dlm->copy_to_dlm_work); + cancel_work_sync(&pr_dlm->lvb_upd_work); + cancel_work_sync(&pr_dlm->reread_lvb_work); + + release_lockspace(ls, pr_dlm->cl_dev_id); + goto out_unlock_ls_cr; +} + +static bool scst_dlm_pr_is_set(struct scst_device *dev) +{ + get_lockspace(dev); + return dev->pr_is_set; +} + +static void scst_dlm_pr_write_lock(struct scst_device *dev, + struct scst_lksb *pr_lksb) +{ + struct scst_pr_dlm_data *const pr_dlm = dev->pr_dlm; + dlm_lockspace_t *ls; + + memset(pr_lksb, 0, sizeof(*pr_lksb)); + + ls = get_lockspace(dev); + if (!ls) + goto out; + + scst_dlm_lock_wait(ls, DLM_LOCK_EX, pr_lksb, 0, PR_LOCK, NULL); + if (pr_lksb->lksb.sb_lkid) { + scst_pr_toggle_lock(pr_dlm, ls, PR_POST_UPDATE_LOCK); + scst_pr_toggle_lock(pr_dlm, ls, PR_PRE_UPDATE_LOCK); + scst_dlm_lock_wait(ls, DLM_LOCK_PW, + &pr_dlm->data_lksb, + DLM_LKF_CONVERT | DLM_LKF_VALBLK, + PR_DATA_LOCK, NULL); + } + +out: + /* + * Note: invoking scst_copy_from_dlm(dev) here is not necessary + * because that function is already invoked after joining the + * lockspace and from inside post_bast(). 
+ */ + scst_pr_write_lock(dev); +} + +static void scst_dlm_pr_write_unlock(struct scst_device *dev, + struct scst_lksb *pr_lksb) +{ + struct scst_pr_dlm_data *const pr_dlm = dev->pr_dlm; + dlm_lockspace_t *ls = pr_dlm->ls; + + scst_pr_write_unlock(dev); + + if (!pr_lksb->lksb.sb_lkid) + return; + + scst_copy_to_dlm(dev, ls); + scst_dlm_lock_wait(ls, DLM_LOCK_CR, &pr_dlm->data_lksb, + DLM_LKF_CONVERT | DLM_LKF_VALBLK, PR_DATA_LOCK, + NULL); + scst_pr_toggle_lock(pr_dlm, ls, PR_POST_UPDATE_LOCK); + scst_dlm_unlock_wait(ls, pr_lksb); +} + +static bool scst_dlm_reserved(struct scst_device *dev) +{ + EXTRACHECKS_BUG_ON(in_irq() || irqs_disabled()); + + get_lockspace(dev); + return dev->reserved_by || dev->pr_dlm->reserved_by_nodeid; +} + +static void scst_dlm_res_lock(struct scst_device *dev, + struct scst_lksb *pr_lksb) + __acquires(&dev->dev_lock) +{ + struct scst_pr_dlm_data *const pr_dlm = dev->pr_dlm; + dlm_lockspace_t *ls; + + EXTRACHECKS_BUG_ON(in_irq() || irqs_disabled()); + memset(pr_lksb, 0, sizeof(*pr_lksb)); + ls = get_lockspace(dev); + if (!ls) + goto out; + + scst_dlm_lock_wait(ls, DLM_LOCK_EX, pr_lksb, 0, PR_LOCK, NULL); + if (pr_lksb->lksb.sb_lkid) { + scst_dlm_lock_wait(ls, DLM_LOCK_PW, &pr_dlm->data_lksb, + DLM_LKF_CONVERT | DLM_LKF_VALBLK, + PR_DATA_LOCK, NULL); + } + +out: + spin_lock_bh(&dev->dev_lock); +} + +static void scst_dlm_res_unlock(struct scst_device *dev, + struct scst_lksb *pr_lksb) + __releases(&dev->dev_lock) +{ + struct scst_pr_dlm_data *const pr_dlm = dev->pr_dlm; + dlm_lockspace_t *ls = pr_dlm->ls; + struct pr_lvb *lvb = (void *)pr_dlm->lvb; + bool update_lvb; + + spin_unlock_bh(&dev->dev_lock); + + if (!pr_lksb->lksb.sb_lkid) + return; + + update_lvb = (be32_to_cpu(lvb->reserved_by_nodeid) != + pr_dlm->reserved_by_nodeid); + + if (update_lvb) + scst_copy_to_dlm(dev, ls); + scst_dlm_lock_wait(ls, DLM_LOCK_CR, &pr_dlm->data_lksb, + DLM_LKF_CONVERT | DLM_LKF_VALBLK, PR_DATA_LOCK, + NULL); + if (update_lvb) { + scst_pr_toggle_lock(pr_dlm, ls, PR_PRE_UPDATE_LOCK); + scst_pr_toggle_lock(pr_dlm, ls, PR_POST_UPDATE_LOCK); + } + scst_dlm_unlock_wait(ls, pr_lksb); +} + +static bool scst_dlm_is_rsv_holder(struct scst_device *dev, + struct scst_session *sess) +{ + return dev->reserved_by == sess; +} + +static bool scst_dlm_is_not_rsv_holder(struct scst_device *dev, + struct scst_session *sess) +{ + return dev->pr_dlm->reserved_by_nodeid && dev->reserved_by != sess; +} + +static void scst_dlm_reserve(struct scst_device *dev, struct scst_session *sess) +{ + dev->reserved_by = sess; + dev->pr_dlm->reserved_by_nodeid = sess ? dev->pr_dlm->local_nodeid : 0; +} + +static void scst_dlm_pre_bast(void *bastarg, int mode) +{ + struct scst_lksb *pre_lksb = bastarg; + struct scst_pr_dlm_data *pr_dlm = pre_lksb->pr_dlm; + const bool join = pre_lksb == &pr_dlm->pre_join_lksb; + + /* An AST must not block, so execute further work asynchronously. 
*/ + if (join) + queue_work(pr_dlm->to_wq, &pr_dlm->pre_join_work); + else + queue_work(pr_dlm->from_wq, &pr_dlm->pre_upd_work); +} + +static void scst_pre_join_work(struct work_struct *work) +{ + struct scst_pr_dlm_data *pr_dlm = container_of(work, + struct scst_pr_dlm_data, pre_join_work); + dlm_lockspace_t *ls; + + mutex_lock(&pr_dlm->ls_mutex); + ls = pr_dlm->ls; + if (ls) { + scst_dlm_lock_wait(ls, DLM_LOCK_EX, &pr_dlm->post_join_lksb, + DLM_LKF_CONVERT, NULL, scst_dlm_post_bast); + scst_dlm_lock_wait(ls, DLM_LOCK_NL, &pr_dlm->pre_join_lksb, + DLM_LKF_CONVERT, NULL, scst_dlm_pre_bast); + } + mutex_unlock(&pr_dlm->ls_mutex); +} + +static void scst_pre_upd_work(struct work_struct *work) +{ + struct scst_pr_dlm_data *pr_dlm = container_of(work, + struct scst_pr_dlm_data, pre_upd_work); + dlm_lockspace_t *ls; + + mutex_lock(&pr_dlm->ls_mutex); + ls = pr_dlm->ls; + if (ls) { + scst_dlm_lock_wait(ls, DLM_LOCK_EX, &pr_dlm->post_upd_lksb, + DLM_LKF_CONVERT, NULL, scst_dlm_post_bast); + scst_dlm_lock_wait(ls, DLM_LOCK_NL, &pr_dlm->pre_upd_lksb, + DLM_LKF_CONVERT, NULL, scst_dlm_pre_bast); + } + mutex_unlock(&pr_dlm->ls_mutex); +} + +static void scst_dlm_post_bast(void *bastarg, int mode) +{ + struct scst_lksb *post_lksb = bastarg; + struct scst_pr_dlm_data *pr_dlm = post_lksb->pr_dlm; + const bool join = post_lksb == &pr_dlm->post_join_lksb; + + /* An AST must not block, so execute further work asynchronously. */ + if (join) + queue_work(pr_dlm->to_wq, &pr_dlm->copy_to_dlm_work); + else + queue_work(pr_dlm->from_wq, &pr_dlm->copy_from_dlm_work); +} + +/* + * Note: the node that has invoked scst_trigger_lvb_update() holds PR_LOCK + * in EX mode and waits until this function has finished. + */ +static void scst_copy_to_dlm_work(struct work_struct *work) +{ + struct scst_pr_dlm_data *pr_dlm = container_of(work, + struct scst_pr_dlm_data, copy_to_dlm_work); + struct scst_device *dev = pr_dlm->dev; + dlm_lockspace_t *ls; + int res; + + PRINT_INFO("Copying PR state to the DLM"); + + mutex_lock(&pr_dlm->ls_mutex); + ls = pr_dlm->ls; + if (!ls) + goto unlock_ls; + scst_dlm_lock_wait(ls, DLM_LOCK_EX, &pr_dlm->pre_join_lksb, + DLM_LKF_CONVERT, NULL, scst_dlm_pre_bast); + res = scst_dlm_lock_wait(ls, DLM_LOCK_PW, &pr_dlm->data_lksb, + DLM_LKF_CONVERT | DLM_LKF_VALBLK, PR_DATA_LOCK, + NULL); + if (res < 0) { + PRINT_WARNING("dlm_lock(%s.%s) returned %d", dev->virt_name, + PR_DATA_LOCK, res); + goto unlock_pr; + } + + /* + * Note: whether or not the PR_DATA_LOCK LVB is valid does not matter + * here since we are going to overwrite it anyway. + */ + if (pr_dlm->data_lksb.lksb.sb_flags & DLM_SBF_VALNOTVALID) + PRINT_INFO("%s.%s LVB not valid\n", dev->virt_name, + PR_DATA_LOCK); + + scst_copy_to_dlm(dev, ls); + scst_dlm_lock_wait(ls, DLM_LOCK_CR, &pr_dlm->data_lksb, + DLM_LKF_CONVERT | DLM_LKF_VALBLK, PR_DATA_LOCK, + NULL); + +unlock_pr: + dlm_lock(ls, DLM_LOCK_NL, &pr_dlm->post_join_lksb.lksb, + DLM_LKF_CONVERT, NULL, 0, 0, scst_dlm_post_ast, + &pr_dlm->post_join_lksb, scst_dlm_post_bast); + + PRINT_INFO("Finished copying PR state to the DLM"); + + scst_dlm_update_nodeids(pr_dlm); + + scst_pr_toggle_lock(pr_dlm, ls, PR_POST_UPDATE_LOCK); + scst_pr_toggle_lock(pr_dlm, ls, PR_PRE_UPDATE_LOCK); + scst_pr_toggle_lock(pr_dlm, ls, PR_POST_UPDATE_LOCK); + +unlock_ls: + mutex_unlock(&pr_dlm->ls_mutex); + + PRINT_INFO("Finished notifying other nodes about the new PR state"); +} + +/* + * Note: the scst_copy_from_dlm() call below runs outside command context. 
It + * is protected against device removal because scst_pr_dlm_cleanup() is + * invoked before a device is removed and that last function waits until this + * function has finished and additionally prevents new invocations of this + * function. The scst_copy_from_dlm() call below is protected against tgt_dev + * addition or removal (e.g. due to a cable pull) because + * scst_pr_init_tgt_dev() and scst_pr_clear_tgt_dev() in scst_pres.c protect + * these manipulations by locking the PR data structures for writing. + */ +static void scst_copy_from_dlm_work(struct work_struct *work) +{ + struct scst_pr_dlm_data *pr_dlm = container_of(work, + struct scst_pr_dlm_data, copy_from_dlm_work); + struct scst_device *dev = pr_dlm->dev; + dlm_lockspace_t *ls; + int res = -ENOENT; + bool modified_lvb = false; + + mutex_lock(&pr_dlm->ls_mutex); + ls = pr_dlm->ls; + if (!ls) + goto unlock_ls; + scst_dlm_lock_wait(ls, DLM_LOCK_EX, &pr_dlm->pre_upd_lksb, + DLM_LKF_CONVERT, NULL, scst_dlm_pre_bast); + res = scst_dlm_lock_wait(ls, DLM_LOCK_PW, &pr_dlm->data_lksb, + DLM_LKF_CONVERT | DLM_LKF_VALBLK, PR_DATA_LOCK, + NULL); + if (res < 0) { + PRINT_WARNING("dlm_lock(%s.%s) returned %d", dev->virt_name, + PR_DATA_LOCK, res); + goto unlock_pr; + } + if (pr_dlm->data_lksb.lksb.sb_flags & DLM_SBF_VALNOTVALID) { + PRINT_WARNING("%s.%s has an invalid lock value block", + dev->virt_name, PR_DATA_LOCK); + res = -EINVAL; + goto unlock_pr; + } + res = scst_copy_from_dlm(dev, ls, &modified_lvb); + scst_dlm_lock_wait(ls, DLM_LOCK_CR, &pr_dlm->data_lksb, + DLM_LKF_CONVERT | DLM_LKF_VALBLK, PR_DATA_LOCK, + NULL); + +unlock_pr: + dlm_lock(ls, DLM_LOCK_NL, &pr_dlm->post_upd_lksb.lksb, + DLM_LKF_CONVERT, NULL, 0, 0, scst_dlm_post_ast, + &pr_dlm->post_upd_lksb, scst_dlm_post_bast); + + scst_dlm_update_nodeids(pr_dlm); + +unlock_ls: + mutex_unlock(&pr_dlm->ls_mutex); + + if (res == -EINVAL) + queue_work(pr_dlm->upd_wq, &pr_dlm->lvb_upd_work); + else if (modified_lvb) + queue_work(pr_dlm->upd_wq, &pr_dlm->reread_lvb_work); +} + +static void scst_dlm_post_ast(void *astarg) +{ +} + +/* Tell other nodes to refresh their local state from the lock value blocks. */ +static void scst_reread_lvb_work(struct work_struct *work) +{ + struct scst_pr_dlm_data *pr_dlm = container_of(work, + struct scst_pr_dlm_data, reread_lvb_work); + dlm_lockspace_t *ls; + struct scst_lksb pr_lksb; + int res; + + mutex_lock(&pr_dlm->ls_mutex); + ls = pr_dlm->ls; + if (!ls) + goto unlock_ls; + memset(&pr_lksb, 0, sizeof(pr_lksb)); + res = scst_dlm_lock_wait(ls, DLM_LOCK_EX, &pr_lksb, 0, PR_LOCK, + NULL); + if (res >= 0) + scst_trigger_reread_lvb(pr_dlm, ls); + if (pr_lksb.lksb.sb_lkid) + scst_dlm_unlock_wait(ls, &pr_lksb); + +unlock_ls: + mutex_unlock(&pr_dlm->ls_mutex); +} + +/* Tell other nodes to update the DLM lock value blocks. 
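+ * This work is queued by scst_copy_from_dlm_work() after it has encountered
+ * an invalid lock value block (-EINVAL).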
 */
+static void scst_lvb_upd_work(struct work_struct *work)
+{
+	struct scst_pr_dlm_data *pr_dlm = container_of(work,
+				struct scst_pr_dlm_data, lvb_upd_work);
+	dlm_lockspace_t *ls;
+	struct scst_lksb lksb;
+	int res;
+
+	mutex_lock(&pr_dlm->ls_mutex);
+	ls = pr_dlm->ls;
+	if (!ls)
+		goto unlock_ls;
+	memset(&lksb, 0, sizeof(lksb));
+	res = scst_dlm_lock_wait(ls, DLM_LOCK_EX, &lksb, 0, PR_LOCK, NULL);
+	if (res >= 0)
+		scst_trigger_lvb_update(pr_dlm, ls);
+	if (lksb.lksb.sb_lkid)
+		scst_dlm_unlock_wait(ls, &lksb);
+
+unlock_ls:
+	mutex_unlock(&pr_dlm->ls_mutex);
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0)
+static struct workqueue_struct *
+alloc_workqueue_backport(const char *fmt, unsigned flags, unsigned max_active,
+			 ...)
+{
+	struct workqueue_struct *wq = NULL;
+	va_list ap;
+	char *name;
+
+	va_start(ap, fmt);
+	name = kvasprintf(GFP_KERNEL, fmt, ap);
+	va_end(ap);
+	if (name)
+		wq = alloc_workqueue(name, flags, max_active);
+	kfree(name);
+	return wq;
+}
+#undef alloc_workqueue
+#define alloc_workqueue alloc_workqueue_backport
+#endif
+
+/*
+ * Caller must ensure that no commands are being executed for device @dev,
+ * e.g. by suspending commands before calling this function.
+ */
+static int scst_pr_dlm_init(struct scst_device *dev, const char *cl_dev_id)
+{
+	int res = -ENOMEM;
+
+	compile_time_size_checks();
+	dev->pr_dlm = kzalloc(sizeof(*dev->pr_dlm), GFP_KERNEL);
+	if (!dev->pr_dlm)
+		goto out;
+	dev->pr_dlm->dev = dev;
+	mutex_init(&dev->pr_dlm->ls_cr_mutex);
+	mutex_init(&dev->pr_dlm->ls_mutex);
+	dev->pr_dlm->data_lksb.lksb.sb_lvbptr = dev->pr_dlm->lvb;
+	INIT_WORK(&dev->pr_dlm->pre_join_work, scst_pre_join_work);
+	INIT_WORK(&dev->pr_dlm->pre_upd_work, scst_pre_upd_work);
+	INIT_WORK(&dev->pr_dlm->copy_from_dlm_work, scst_copy_from_dlm_work);
+	INIT_WORK(&dev->pr_dlm->copy_to_dlm_work, scst_copy_to_dlm_work);
+	INIT_WORK(&dev->pr_dlm->lvb_upd_work, scst_lvb_upd_work);
+	INIT_WORK(&dev->pr_dlm->reread_lvb_work, scst_reread_lvb_work);
+	dev->pr_dlm->latest_lscr_attempt = jiffies - 100 * HZ;
+
+	res = -ENOMEM;
+	dev->pr_dlm->cl_dev_id = kstrdup(cl_dev_id, GFP_KERNEL);
+	if (!dev->pr_dlm->cl_dev_id)
+		goto err_free;
+
+	/* Note: alloc_ordered_workqueue() returns NULL on failure. */
+	dev->pr_dlm->from_wq = alloc_ordered_workqueue("%s_from_dlm", 0,
+						       dev->virt_name);
+	if (!dev->pr_dlm->from_wq)
+		goto err_free;
+
+	dev->pr_dlm->to_wq = alloc_ordered_workqueue("%s_to_dlm", 0,
+						     dev->virt_name);
+	if (!dev->pr_dlm->to_wq)
+		goto err_free;
+
+	dev->pr_dlm->upd_wq = alloc_ordered_workqueue("%s_upd_dlm", 0,
+						      dev->virt_name);
+	if (!dev->pr_dlm->upd_wq)
+		goto err_free;
+
+	res = 0;
+
+out:
+	return res;
+
+err_free:
+	scst_pr_dlm_cleanup(dev);
+	goto out;
+}
+
+/*
+ * Note: The caller must ensure that get_lockspace() is not invoked
+ * concurrently with scst_pr_dlm_cleanup(). This can be realized by suspending
+ * command execution and by holding scst_mutex. The get_lockspace() callers are:
+ * - scst_dlm_pr_is_set();
+ * - scst_dlm_pr_write_lock();
+ * - scst_dlm_reserved();
+ * - scst_dlm_res_lock().
+ * The first three functions are invoked from command context only. The last
+ * function is either invoked from command context or is invoked with
+ * scst_mutex held (from scst_clear_reservation(),
+ * scst_reassign_persistent_sess_states() and scst_obtain_device_parameters()).
+
+/*
+ * Note: The caller must ensure that get_lockspace() is not invoked
+ * concurrently with scst_pr_dlm_cleanup(). This can be realized by suspending
+ * command execution and by holding scst_mutex. The get_lockspace() callers
+ * are:
+ * - scst_dlm_pr_is_set();
+ * - scst_dlm_pr_write_lock();
+ * - scst_dlm_reserved();
+ * - scst_dlm_res_lock().
+ * The first three functions are invoked from command context only. The last
+ * function is either invoked from command context or is invoked with
+ * scst_mutex held (from scst_clear_reservation(),
+ * scst_reassign_persistent_sess_states() and
+ * scst_obtain_device_parameters()).
+ */
+static void scst_pr_dlm_cleanup(struct scst_device *dev)
+{
+	struct scst_pr_dlm_data *const pr_dlm = dev->pr_dlm;
+	dlm_lockspace_t *ls;
+	struct scst_lksb pr_lksb;
+
+	if (!pr_dlm)
+		return;
+	ls = pr_dlm->ls;
+	if (ls) {
+		memset(&pr_lksb, 0, sizeof(pr_lksb));
+
+		mutex_lock(&pr_dlm->ls_mutex);
+		scst_dlm_lock_wait(ls, DLM_LOCK_EX, &pr_lksb, 0, PR_LOCK,
+				   NULL);
+		scst_dlm_remove_locks(pr_dlm, ls);
+		scst_dlm_unlock_wait(ls, &pr_lksb);
+		pr_dlm->ls = NULL;
+		mutex_unlock(&pr_dlm->ls_mutex);
+
+		if (pr_dlm->from_wq)
+			cancel_work_sync(&pr_dlm->copy_from_dlm_work);
+		if (pr_dlm->to_wq)
+			cancel_work_sync(&pr_dlm->copy_to_dlm_work);
+		if (pr_dlm->upd_wq) {
+			cancel_work_sync(&pr_dlm->lvb_upd_work);
+			cancel_work_sync(&pr_dlm->reread_lvb_work);
+		}
+		release_lockspace(ls, pr_dlm->cl_dev_id);
+	}
+	if (pr_dlm->upd_wq)
+		destroy_workqueue(pr_dlm->upd_wq);
+	if (pr_dlm->to_wq)
+		destroy_workqueue(pr_dlm->to_wq);
+	if (pr_dlm->from_wq)
+		destroy_workqueue(pr_dlm->from_wq);
+	kfree(pr_dlm->nodeid);
+	kfree(pr_dlm->cl_dev_id);
+	kfree(pr_dlm);
+	dev->pr_dlm = NULL;
+}
+
+const struct scst_cl_ops scst_dlm_cl_ops = {
+	.pr_init = scst_pr_dlm_init,
+	.pr_cleanup = scst_pr_dlm_cleanup,
+	.pr_is_set = scst_dlm_pr_is_set,
+	.pr_init_reg = scst_dlm_pr_init_reg,
+	.pr_rm_reg = scst_dlm_pr_rm_reg,
+	.pr_write_lock = scst_dlm_pr_write_lock,
+	.pr_write_unlock = scst_dlm_pr_write_unlock,
+	.reserved = scst_dlm_reserved,
+	.res_lock = scst_dlm_res_lock,
+	.res_unlock = scst_dlm_res_unlock,
+	.is_rsv_holder = scst_dlm_is_rsv_holder,
+	.is_not_rsv_holder = scst_dlm_is_not_rsv_holder,
+	.reserve = scst_dlm_reserve,
+};
diff --git a/scst/src/scst_dlm.h b/scst/src/scst_dlm.h
new file mode 100644
index 000000000..0e0956060
--- /dev/null
+++ b/scst/src/scst_dlm.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2013 - 2014 Fusion-io, Inc. All rights reserved.
+ * Copyright (C) 2014 - 2015 SanDisk Corporation.
+ *
+ * Synchronization of persistent registration data with DLM lock value blocks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2
+ * of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __SCST_PRES_DLM_H
+#define __SCST_PRES_DLM_H
+
+#include <linux/dlm.h>		/* dlm_lockspace_t */
+#include <linux/mutex.h>	/* struct mutex */
+#include <linux/workqueue.h>	/* struct work_struct */
+
+#define SCST_DLM_LOCKSPACE_PFX "scst-"
+
+/*
+ * DLM lock names
+ */
+#define PR_LOCK "pr"
+#define PR_DATA_LOCK "pr_data"
+#define PR_PRE_JOIN_LOCK "pr_pre_join_%d"
+#define PR_POST_JOIN_LOCK "pr_post_join_%d"
+#define PR_PRE_UPDATE_LOCK "pr_pre_%d"
+#define PR_POST_UPDATE_LOCK "pr_post_%d"
+#define PR_REG_LOCK "pr_reg_%02d"
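Most of these lock names are printf-style templates that are instantiated per Corosync node ID before being handed to the DLM, e.g. (hypothetical snippet; the buffer size is arbitrary):

    char name[32];

    /* Yields "pr_pre_3" for Corosync node ID 3. */
    snprintf(name, sizeof(name), PR_PRE_UPDATE_LOCK, 3);

PR_REG_LOCK zero-pads its number to two digits, presumably so that the per-registrant lock names sort predictably ("pr_reg_00", "pr_reg_01", ...).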
+
+/*
+ * Data members needed for managing PR data via the DLM.
+ *
+ * Lock order when using the DLM (from outer to inner):
+ * - scst_mutex;
+ * - ls_cr_mutex;
+ * - ls_mutex;
+ * - PR_LOCK;
+ * - PR_PRE_UPDATE_LOCK, PR_POST_UPDATE_LOCK, PR_PRE_JOIN_LOCK,
+ *   PR_POST_JOIN_LOCK;
+ * - PR_DATA_LOCK;
+ * - PR_REG_LOCK;
+ * - dev_pr_mutex / dev_lock.
+ */
+struct scst_pr_dlm_data {
+	/* Backpointer to the SCST device. */
+	struct scst_device *dev;
+
+	/* Lockspace name suffix. */
+	const char *cl_dev_id;
+
+	/* Mutex that protects initialization of the lockspace pointer. */
+	struct mutex ls_cr_mutex;
+
+	/* Mutex that protects the lock status blocks. */
+	struct mutex ls_mutex;
+
+	/*
+	 * Pointer to the DLM lockspace that contains the persistent
+	 * reservation and SPC-2 reservation data for device @dev.
+	 */
+	dlm_lockspace_t *ls;
+
+	/* Time of the latest lockspace creation attempt. */
+	unsigned long latest_lscr_attempt;
+
+	/* Corosync node ID of the local node. */
+	uint32_t local_nodeid;
+
+	/* Number of elements in the nodeid array. */
+	int participants;
+
+	/* Corosync cluster node IDs. Protected by ls_mutex. */
+	uint32_t *nodeid;
+
+	/* Workqueue for copy_from_dlm_work. */
+	struct workqueue_struct *from_wq;
+	/* Workqueue for copy_to_dlm_work. */
+	struct workqueue_struct *to_wq;
+	/* Workqueue for lvb_upd_work and reread_lvb_work. */
+	struct workqueue_struct *upd_wq;
+
+	struct work_struct pre_join_work;
+	struct work_struct pre_upd_work;
+	struct work_struct copy_from_dlm_work;
+	struct work_struct copy_to_dlm_work;
+	struct work_struct lvb_upd_work;
+	struct work_struct reread_lvb_work;
+
+	/*
+	 * DLM lock IDs of the locks used for persistent reservation data and
+	 * the associated notification protocol.
+	 */
+	struct scst_lksb pre_join_lksb;
+	struct scst_lksb post_join_lksb;
+	struct scst_lksb data_lksb;
+	struct scst_lksb pre_upd_lksb;
+	struct scst_lksb post_upd_lksb;
+
+	/* PR_DATA_LOCK LVB. */
+	uint8_t lvb[PR_DLM_LVB_LEN];
+
+	/* SPC-2 reservation state information. */
+	uint32_t reserved_by_nodeid;
+};
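As an illustration of this lock order, the nesting that scst_copy_from_dlm_work() above effectively uses is (simplified; error handling and unlock paths omitted):

    mutex_lock(&pr_dlm->ls_mutex);                  /* ls_mutex */
    scst_dlm_lock_wait(ls, DLM_LOCK_EX, &pr_dlm->pre_upd_lksb,
                       DLM_LKF_CONVERT, NULL,
                       scst_dlm_pre_bast);          /* PR_PRE_UPDATE_LOCK */
    scst_dlm_lock_wait(ls, DLM_LOCK_PW, &pr_dlm->data_lksb,
                       DLM_LKF_CONVERT | DLM_LKF_VALBLK,
                       PR_DATA_LOCK, NULL);         /* PR_DATA_LOCK */
    /* ... read or update the LVB ... */
    mutex_unlock(&pr_dlm->ls_mutex);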
+
+/**
+ * struct pr_lvb - PR_DATA_LOCK LVB data format
+ * @nr_registrants: number of reservation keys that have been registered
+ * @pr_generation: persistent reservation generation
+ * @version: version of this structure
+ * @pr_is_set: whether the device has been reserved persistently
+ * @pr_type: persistent reservation type
+ * @pr_scope: persistent reservation scope
+ * @pr_aptpl: persistent reservation APTPL
+ * @reserved_by_nodeid: Corosync node ID of the node holding an SPC-2
+ *	reservation. Zero if no SPC-2 reservation is held.
+ */
+struct pr_lvb {
+	__be32	nr_registrants;
+	__be32	pr_generation;
+	u8	version;
+	u8	pr_is_set;
+	u8	pr_type;
+	u8	pr_scope;
+	u8	pr_aptpl;
+	u8	reserved[3];
+	__be32	reserved_by_nodeid;
+};
+
+/**
+ * struct pr_reg_lvb - PR_REG_LOCK LVB data format
+ * @key: reservation key
+ * @rel_tgt_id: relative target id
+ * @version: version of this structure
+ * @is_holder: whether or not holding the reservation
+ * @tid: transport ID - up to 228 bytes for iSCSI
+ */
+struct pr_reg_lvb {
+	__be64	key;
+	__be16	rel_tgt_id;
+	u8	version;
+	u8	is_holder;
+	u8	tid[228];
+};
+
+#endif /* __SCST_PRES_DLM_H */
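All multi-byte fields in these two structures are big-endian (__be32/__be64), so the LVB layout is identical on every cluster node regardless of host byte order. A hypothetical helper that fills in a struct pr_lvb from the device state (the function name and version value are illustrative, not taken from scst_dlm.c):

    static void pr_lvb_fill(struct pr_lvb *lvb,
                            const struct scst_device *dev,
                            u32 nr_registrants)
    {
            memset(lvb, 0, sizeof(*lvb));
            lvb->nr_registrants = cpu_to_be32(nr_registrants);
            lvb->pr_generation = cpu_to_be32(dev->pr_generation);
            lvb->version = 1;               /* illustrative version number */
            lvb->pr_is_set = dev->pr_is_set;
            lvb->pr_type = dev->pr_type;
            lvb->pr_scope = dev->pr_scope;
            lvb->pr_aptpl = dev->pr_aptpl;
    }

A node reading the LVB converts the 32-bit fields back with be32_to_cpu() before using them.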
diff --git a/scst/src/scst_lib.c b/scst/src/scst_lib.c
index 9f7118c73..f9f8139d5 100644
--- a/scst/src/scst_lib.c
+++ b/scst/src/scst_lib.c
@@ -6090,17 +6090,18 @@ out:
 static void scst_clear_reservation(struct scst_tgt_dev *tgt_dev)
 {
 	struct scst_device *dev = tgt_dev->dev;
+	struct scst_lksb pr_lksb;
 	int release = 0;
 
 	TRACE_ENTRY();
 
-	spin_lock_bh(&dev->dev_lock);
+	scst_res_lock(dev, &pr_lksb);
 	if (scst_is_reservation_holder(dev, tgt_dev->sess)) {
 		/* This is one who holds the reservation */
 		scst_clear_dev_reservation(dev);
 		release = 1;
 	}
-	spin_unlock_bh(&dev->dev_lock);
+	scst_res_unlock(dev, &pr_lksb);
 
 	if (release)
 		scst_send_release(dev);
@@ -11729,11 +11730,12 @@ static bool __scst_dev_check_set_UA(struct scst_device *dev,
 void scst_dev_check_set_UA(struct scst_device *dev,
 	struct scst_cmd *exclude, const uint8_t *sense, int sense_len)
 {
+	struct scst_lksb lksb;
 	bool rc;
 
-	spin_lock_bh(&dev->dev_lock);
+	scst_res_lock(dev, &lksb);
 	rc = __scst_dev_check_set_UA(dev, exclude, sense, sense_len);
-	spin_unlock_bh(&dev->dev_lock);
+	scst_res_unlock(dev, &lksb);
 
 	if (rc)
 		scst_unblock_aborted_cmds(NULL, NULL, dev, false);
@@ -12480,6 +12482,7 @@ void scst_reassign_retained_sess_states(struct scst_session *new_sess,
 	list_for_each_entry(dev, &scst_dev_list, dev_list_entry) {
 		struct scst_tgt_dev *tgt_dev;
 		struct scst_tgt_dev *new_tgt_dev = NULL, *old_tgt_dev = NULL;
+		struct scst_lksb pr_lksb;
 
 		TRACE_DBG("Processing dev %s", dev->virt_name);
@@ -12506,11 +12509,13 @@ void scst_reassign_retained_sess_states(struct scst_session *new_sess,
 
 		/** Reassign regular reservations **/
 
+		scst_res_lock(dev, &pr_lksb);
 		if (scst_is_reservation_holder(dev, old_sess)) {
 			scst_reserve_dev(dev, new_sess);
 			TRACE_DBG("Reservation reassigned from old_tgt_dev %p "
				  "to new_tgt_dev %p", old_tgt_dev, new_tgt_dev);
 		}
+		scst_res_unlock(dev, &pr_lksb);
 
 		/** Reassign PRs **/
diff --git a/scst/src/scst_no_dlm.c b/scst/src/scst_no_dlm.c
new file mode 100644
index 000000000..2c6d639d4
--- /dev/null
+++ b/scst/src/scst_no_dlm.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2013 - 2014 Fusion-io, Inc. All rights reserved.
+ * Copyright (C) 2014 - 2015 SanDisk Corporation.
+ *
+ * Synchronization framework for persistent registration data without
+ * DLM locks.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2
+ * of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifdef INSIDE_KERNEL_TREE
+#include <scst/scst.h>
+#include <scst/scst_const.h>
+#else
+#include "scst.h"
+#include "scst_const.h"
+#endif
+#include "scst_priv.h"
+#include "scst_pres.h"
+
+static int scst_no_dlm_pr_init(struct scst_device *dev, const char *cl_dev_id)
+{
+	return 0;
+}
+
+static void scst_no_dlm_pr_cleanup(struct scst_device *dev)
+{
+}
+
+static bool scst_no_dlm_pr_is_set(struct scst_device *dev)
+{
+	return dev->pr_is_set;
+}
+
+static void scst_no_dlm_pr_init_reg(struct scst_device *dev,
+				    struct scst_dev_registrant *reg)
+{
+}
+
+static void scst_no_dlm_pr_rm_reg(struct scst_device *dev,
+				  struct scst_dev_registrant *reg)
+{
+}
+
+static void scst_no_dlm_pr_write_lock(struct scst_device *dev,
+				      struct scst_lksb *pr_lksb)
+{
+	scst_pr_write_lock(dev);
+}
+
+static void scst_no_dlm_pr_write_unlock(struct scst_device *dev,
+					struct scst_lksb *pr_lksb)
+{
+	scst_pr_write_unlock(dev);
+}
+
+static bool scst_no_dlm_reserved(struct scst_device *dev)
+{
+	return dev->reserved_by;
+}
+
+static void scst_no_dlm_res_lock(struct scst_device *dev,
+				 struct scst_lksb *pr_lksb)
+	__acquires(&dev->dev_lock)
+{
+	EXTRACHECKS_BUG_ON(in_irq() || irqs_disabled());
+	spin_lock_bh(&dev->dev_lock);
+}
+
+static void scst_no_dlm_res_unlock(struct scst_device *dev,
+				   struct scst_lksb *pr_lksb)
+	__releases(&dev->dev_lock)
+{
+	spin_unlock_bh(&dev->dev_lock);
+}
+
+static bool scst_no_dlm_is_rsv_holder(struct scst_device *dev,
+				      struct scst_session *sess)
+{
+	EXTRACHECKS_BUG_ON(sess == NULL);
+	return dev->reserved_by == sess;
+}
+
+static bool scst_no_dlm_is_not_rsv_holder(struct scst_device *dev,
+					  struct scst_session *sess)
+{
+	EXTRACHECKS_BUG_ON(sess == NULL);
+	return dev->reserved_by && dev->reserved_by != sess;
+}
+
+static void scst_no_dlm_reserve(struct scst_device *dev,
+				struct scst_session *sess)
+{
+	dev->reserved_by = sess;
+}
+
+const struct scst_cl_ops scst_no_dlm_cl_ops = {
+	.pr_init = scst_no_dlm_pr_init,
+	.pr_cleanup = scst_no_dlm_pr_cleanup,
+	.pr_is_set = scst_no_dlm_pr_is_set,
+	.pr_init_reg = scst_no_dlm_pr_init_reg,
+	.pr_rm_reg = scst_no_dlm_pr_rm_reg,
+	.pr_write_lock = scst_no_dlm_pr_write_lock,
+	.pr_write_unlock = scst_no_dlm_pr_write_unlock,
+	.reserved = scst_no_dlm_reserved,
+	.res_lock = scst_no_dlm_res_lock,
+	.res_unlock = scst_no_dlm_res_unlock,
+	.is_rsv_holder = scst_no_dlm_is_rsv_holder,
+	.is_not_rsv_holder = scst_no_dlm_is_not_rsv_holder,
+	.reserve = scst_no_dlm_reserve,
+};
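Both ops tables implement the same struct scst_cl_ops interface, so the SCST core never branches on cluster mode itself: it dispatches through dev->cl_ops, usually via the inline helpers added to scst_priv.h further down. A typical caller looks like this (sketch; on a cluster-mode device res_lock() blocks on a DLM lock, otherwise it just takes dev_lock):

    struct scst_lksb lksb;
    bool reserved;

    scst_res_lock(dev, &lksb);              /* dev->cl_ops->res_lock() */
    reserved = scst_dev_reserved(dev);      /* dev->cl_ops->reserved() */
    scst_res_unlock(dev, &lksb);            /* dev->cl_ops->res_unlock() */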
diff --git a/scst/src/scst_pres.c b/scst/src/scst_pres.c
index 6e6fcad9d..acc04481d 100644
--- a/scst/src/scst_pres.c
+++ b/scst/src/scst_pres.c
@@ -88,7 +88,7 @@ static inline void scst_assert_pr_mutex_held(struct scst_device *dev)
 }
 #endif
 
-static inline int scst_tid_size(const uint8_t *tid)
+int scst_tid_size(const uint8_t *tid)
 {
 	sBUG_ON(tid == NULL);
 
@@ -185,7 +185,7 @@ out_error:
 }
 
 /* Must be called under dev_pr_mutex */
-static inline void scst_pr_set_holder(struct scst_device *dev,
+void scst_pr_set_holder(struct scst_device *dev,
 	struct scst_dev_registrant *holder, uint8_t scope, uint8_t type)
 {
 	scst_assert_pr_mutex_held(dev);
@@ -336,7 +336,7 @@ static void scst_pr_find_registrants_list_key(struct scst_device *dev,
 }
 
 /* dev_pr_mutex must be locked */
-static struct scst_dev_registrant *scst_pr_find_reg(
+struct scst_dev_registrant *scst_pr_find_reg(
 	struct scst_device *dev, const uint8_t *transport_id,
 	const uint16_t rel_tgt_id)
 {
@@ -379,7 +379,7 @@ static void scst_pr_clear_reservation(struct scst_device *dev)
 }
 
 /* Must be called under dev_pr_mutex */
-static void scst_pr_clear_holder(struct scst_device *dev)
+void scst_pr_clear_holder(struct scst_device *dev)
 {
 	TRACE_ENTRY();
 
@@ -401,7 +401,7 @@ static void scst_pr_clear_holder(struct scst_device *dev)
 }
 
 /* Must be called under dev_pr_mutex */
-static struct scst_dev_registrant *scst_pr_add_registrant(
+struct scst_dev_registrant *scst_pr_add_registrant(
 	struct scst_device *dev, const uint8_t *transport_id,
 	const uint16_t rel_tgt_id, __be64 key,
 	bool dev_lock_locked)
@@ -441,6 +441,8 @@ static struct scst_dev_registrant *scst_pr_add_registrant(
 		goto out;
 	}
 
+	dev->cl_ops->pr_init_reg(dev, reg);
+
 	reg->transport_id = kmemdup(transport_id, scst_tid_size(transport_id),
				    gfp_flags);
 	if (reg->transport_id == NULL) {
@@ -496,7 +498,7 @@ out_free:
 }
 
 /* Must be called under dev_pr_mutex */
-static void scst_pr_remove_registrant(struct scst_device *dev,
+void scst_pr_remove_registrant(struct scst_device *dev,
 	struct scst_dev_registrant *reg)
 {
 	TRACE_ENTRY();
@@ -510,6 +512,8 @@ static void scst_pr_remove_registrant(struct scst_device *dev,
 
 	list_del(&reg->dev_registrants_list_entry);
 
+	dev->cl_ops->pr_rm_reg(dev, reg);
+
 	if (scst_pr_is_holder(dev, reg))
 		scst_pr_clear_holder(dev);
 
@@ -1142,6 +1146,7 @@ out:
 int scst_pr_init(struct scst_device *dev)
 {
 	mutex_init(&dev->dev_pr_mutex);
+	dev->cl_ops = &scst_no_dlm_cl_ops;
 	dev->pr_generation = 0;
 	dev->pr_is_set = 0;
 	dev->pr_holder = NULL;
@@ -1155,8 +1160,41 @@ int scst_pr_init(struct scst_device *dev)
 /* Free the resources allocated by scst_pr_init(). */
 void scst_pr_cleanup(struct scst_device *dev)
 {
+	dev->cl_ops->pr_cleanup(dev);
 }
 
+/* Caller must hold scst_mutex and activity must be suspended. */
+int scst_pr_set_cluster_mode(struct scst_device *dev, bool cluster_mode,
+			     const char *cl_dev_id)
+{
+	bool cluster_mode_enabled = false;
+	int res = 0;
+
+#if defined(CONFIG_DLM) || defined(CONFIG_DLM_MODULE)
+	cluster_mode_enabled = dev->cl_ops == &scst_dlm_cl_ops;
+
+	if (cluster_mode_enabled == cluster_mode)
+		goto out;
+
+	PRINT_INFO("%s: changing cluster_mode from %d to %d", dev->virt_name,
+		   cluster_mode_enabled, cluster_mode);
+	dev->cl_ops->pr_cleanup(dev);
+	dev->cl_ops = cluster_mode ? &scst_dlm_cl_ops : &scst_no_dlm_cl_ops;
+	res = dev->cl_ops->pr_init(dev, cl_dev_id);
+	if (res) {
+		PRINT_ERROR("%s: changing cluster_mode to %d failed: %d",
+			    dev->virt_name, cluster_mode, res);
+		dev->cl_ops = &scst_no_dlm_cl_ops;
+	}
+#else
+	res = cluster_mode ? -ENOTSUPP : 0;
+#endif
+
+out:
+	return res;
+}
+EXPORT_SYMBOL(scst_pr_set_cluster_mode);
+
 /* Must be called under dev_pr_mutex or before dev is on the device list. */
 int scst_pr_init_dev(struct scst_device *dev)
 {
diff --git a/scst/src/scst_pres.h b/scst/src/scst_pres.h
index 3cbecacc5..2decf5b4c 100644
--- a/scst/src/scst_pres.h
+++ b/scst/src/scst_pres.h
@@ -79,6 +79,11 @@ static inline void scst_pr_read_unlock(struct scst_device *dev)
 	mutex_unlock(&dev->dev_pr_mutex);
 }
 
+static inline void lockdep_assert_pr_read_lock_held(struct scst_device *dev)
+{
+	lockdep_assert_held(&dev->dev_pr_mutex);
+}
+
 static inline void scst_pr_write_lock(struct scst_device *dev)
 {
 	mutex_lock(&dev->dev_pr_mutex);
@@ -89,6 +94,11 @@ static inline void scst_pr_write_unlock(struct scst_device *dev)
 	mutex_unlock(&dev->dev_pr_mutex);
 }
 
+static inline void lockdep_assert_pr_write_lock_held(struct scst_device *dev)
+{
+	lockdep_assert_held(&dev->dev_pr_mutex);
+}
+
 int scst_pr_set_file_name(struct scst_device *dev, char **prev,
			  const char *fmt, ...)
	__printf(3, 4);
@@ -120,6 +130,21 @@ void scst_pr_report_caps(struct scst_cmd *cmd, uint8_t *buffer,
	int buffer_size);
 void scst_pr_read_full_status(struct scst_cmd *cmd, uint8_t *buffer,
	int buffer_size);
 
+int scst_tid_size(const uint8_t *tid);
+struct scst_dev_registrant *scst_pr_find_reg(struct scst_device *dev,
+	const uint8_t *transport_id, const uint16_t rel_tgt_id);
+struct scst_dev_registrant *scst_pr_add_registrant(struct scst_device *dev,
+						   const uint8_t *transport_id,
+						   const uint16_t rel_tgt_id,
+						   __be64 key,
+						   bool dev_lock_locked);
+void scst_pr_remove_registrant(struct scst_device *dev,
+	struct scst_dev_registrant *reg);
+void scst_pr_set_holder(struct scst_device *dev,
+			struct scst_dev_registrant *holder, uint8_t scope,
+			uint8_t type);
+void scst_pr_clear_holder(struct scst_device *dev);
+
 #ifndef CONFIG_SCST_PROC
 void scst_pr_sync_device_file(struct scst_tgt_dev *tgt_dev, struct scst_cmd *cmd);
 #endif
diff --git a/scst/src/scst_priv.h b/scst/src/scst_priv.h
index 33056bd41..7d4ef5b33 100644
--- a/scst/src/scst_priv.h
+++ b/scst/src/scst_priv.h
@@ -170,6 +170,9 @@ extern struct list_head scst_dev_type_list;
 extern struct list_head scst_virtual_dev_type_list;
 extern wait_queue_head_t scst_dev_cmd_waitQ;
 
+extern const struct scst_cl_ops scst_no_dlm_cl_ops;
+extern const struct scst_cl_ops scst_dlm_cl_ops;
+
 #ifdef CONFIG_SCST_PROC
 extern struct list_head scst_acg_list;
 extern struct scst_acg *scst_default_acg;
@@ -398,6 +401,19 @@ void scst_free_session_callback(struct scst_session *sess);
 
 void scst_check_retries(struct scst_tgt *tgt);
 
+static inline int scst_dlm_new_lockspace(const char *name, int namelen,
+					 dlm_lockspace_t **lockspace,
+					 uint32_t flags,
+					 int lvblen)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0)
+	return dlm_new_lockspace(name, namelen, lockspace, flags, lvblen);
+#else
+	return dlm_new_lockspace(name, NULL, flags, lvblen, NULL, NULL, NULL,
+				 lockspace);
+#endif
+}
+
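A hypothetical use of this wrapper, creating the per-device lockspace whose name consists of the SCST_DLM_LOCKSPACE_PFX prefix followed by the lockspace name suffix (the flags value, buffer size and error handling are illustrative):

    dlm_lockspace_t *ls;
    char name[64];
    int res;

    snprintf(name, sizeof(name), SCST_DLM_LOCKSPACE_PFX "%s",
             pr_dlm->cl_dev_id);
    res = scst_dlm_new_lockspace(name, strlen(name), &ls, 0,
                                 PR_DLM_LVB_LEN);
    if (res == 0)
            pr_dlm->ls = ls;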
 #if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 30)
 static inline int scst_exec_req(struct scsi_device *sdev,
	const unsigned char *cmd, int cmd_len, int data_direction,
@@ -599,7 +615,23 @@ void scst_acn_sysfs_del(struct scst_acn *acn);
  */
 static inline bool scst_dev_reserved(struct scst_device *dev)
 {
-	return dev->reserved_by;
+	return dev->cl_ops->reserved(dev);
+}
+
+/* Protect SPC-2 reservation state against concurrent modifications. */
+static inline void scst_res_lock(struct scst_device *dev,
+				 struct scst_lksb *pr_lksb)
+	__acquires(&dev->dev_lock)
+{
+	dev->cl_ops->res_lock(dev, pr_lksb);
+}
+
+static inline void scst_res_unlock(struct scst_device *dev,
+				   struct scst_lksb *pr_lksb)
+	__releases(&dev->dev_lock)
+{
+	dev->cl_ops->res_unlock(dev, pr_lksb);
 }
 
 /*
@@ -610,7 +642,7 @@ static inline bool scst_is_reservation_holder(
	struct scst_device *dev, struct scst_session *sess)
 {
 	EXTRACHECKS_BUG_ON(sess == NULL);
-	return dev->reserved_by == sess;
+	return dev->cl_ops->is_rsv_holder(dev, sess);
 }
 
 /*
@@ -620,23 +652,22 @@ static inline bool scst_is_reservation_holder(
 static inline bool scst_is_not_reservation_holder(struct scst_device *dev,
	struct scst_session *sess)
 {
-	struct scst_session *reserved_by = dev->reserved_by;
-
 	EXTRACHECKS_BUG_ON(sess == NULL);
-	return reserved_by != NULL && reserved_by != sess;
+	return dev->cl_ops->is_not_rsv_holder(dev, sess);
 }
 
 static inline void scst_reserve_dev(struct scst_device *dev,
	struct scst_session *sess)
 {
+	lockdep_assert_held(&dev->dev_lock);
 	EXTRACHECKS_BUG_ON(sess == NULL);
-	dev->reserved_by = sess;
+	dev->cl_ops->reserve(dev, sess);
 }
 
 static inline void scst_clear_dev_reservation(struct scst_device *dev)
 {
 	lockdep_assert_held(&dev->dev_lock);
-	dev->reserved_by = NULL;
+	dev->cl_ops->reserve(dev, NULL);
 }
 
 void scst_tgt_dev_del_free_UA(struct scst_tgt_dev *tgt_dev,
diff --git a/scst/src/scst_sysfs.c b/scst/src/scst_sysfs.c
index 7a4a033d7..614ba3312 100644
--- a/scst/src/scst_sysfs.c
+++ b/scst/src/scst_sysfs.c
@@ -3139,12 +3139,16 @@ static ssize_t scst_dev_sysfs_pr_file_name_store(struct kobject *kobj,
 {
 	struct scst_sysfs_work_item *work;
 	struct scst_device *dev;
-	char *pr_file_name, *p;
-	int res = -ENOMEM;
+	char *pr_file_name = NULL, *p;
+	int res = -EPERM;
 	bool def = false;
 
 	dev = container_of(kobj, struct scst_device, dev_kobj);
 
+	if (dev->cluster_mode)
+		goto out;
+
+	res = -ENOMEM;
 	pr_file_name = kasprintf(GFP_KERNEL, "%.*s", (int)count, buf);
 	if (!pr_file_name) {
 		PRINT_ERROR("Unable to kasprintf() PR file name");
diff --git a/scst/src/scst_targ.c b/scst/src/scst_targ.c
index d0216b238..44539099b 100644
--- a/scst/src/scst_targ.c
+++ b/scst/src/scst_targ.c
@@ -2379,6 +2379,7 @@ static int scst_reserve_local(struct scst_cmd *cmd)
 {
 	int res = SCST_EXEC_NOT_COMPLETED;
 	struct scst_device *dev;
+	struct scst_lksb pr_lksb;
 
 	TRACE_ENTRY();
 
@@ -2419,14 +2420,14 @@ static int scst_reserve_local(struct scst_cmd *cmd)
 		}
 	}
 
-	spin_lock_bh(&dev->dev_lock);
+	scst_res_lock(dev, &pr_lksb);
 	if (scst_is_not_reservation_holder(dev, cmd->sess)) {
-		spin_unlock_bh(&dev->dev_lock);
+		scst_res_unlock(dev, &pr_lksb);
 		scst_set_cmd_error_status(cmd, SAM_STAT_RESERVATION_CONFLICT);
 		goto out_done;
 	}
 	scst_reserve_dev(dev, cmd->sess);
-	spin_unlock_bh(&dev->dev_lock);
+	scst_res_unlock(dev, &pr_lksb);
 
 out:
 	TRACE_EXIT_RES(res);
@@ -2446,6 +2447,7 @@ static int scst_release_local(struct scst_cmd *cmd)
 {
 	int res = SCST_EXEC_NOT_COMPLETED;
 	struct scst_device *dev;
+	struct scst_lksb pr_lksb;
 
 	TRACE_ENTRY();
 
@@ -2466,7 +2468,7 @@ static int scst_release_local(struct scst_cmd *cmd)
 		}
 	}
 
-	spin_lock_bh(&dev->dev_lock);
+	scst_res_lock(dev, &pr_lksb);
 
 	/*
 	 * The device could be RELEASED behind us, if RESERVING session
@@ -2489,7 +2491,7 @@ static int scst_release_local(struct scst_cmd *cmd)
 		scst_clear_dev_reservation(dev);
 	}
 
-	spin_unlock_bh(&dev->dev_lock);
+	scst_res_unlock(dev, &pr_lksb);
 
 	if (res == SCST_EXEC_COMPLETED)
 		goto out_done;
@@ -2616,6 +2618,7 @@ static int scst_persistent_reserve_out_local(struct scst_cmd *cmd)
 	int action;
 	uint8_t *buffer;
 	int buffer_size;
+	struct scst_lksb pr_lksb;
 	bool aborted = false;
 
 	TRACE_ENTRY();
 
@@ -2651,7 +2654,7 @@ static int scst_persistent_reserve_out_local(struct scst_cmd *cmd)
 	if (unlikely(buffer_size <= 0))
 		goto out_done;
 
-	scst_pr_write_lock(dev);
+	dev->cl_ops->pr_write_lock(dev, &pr_lksb);
 
 	/*
 	 * Check if tgt_dev already registered. Also by this check we make
@@ -2742,7 +2745,7 @@ static int scst_persistent_reserve_out_local(struct scst_cmd *cmd)
 	res = SCST_EXEC_NOT_COMPLETED;
 
 out_unlock:
-	scst_pr_write_unlock(dev);
+	dev->cl_ops->pr_write_unlock(dev, &pr_lksb);
 
 	scst_put_buf_full(cmd, buffer);
 
@@ -2816,7 +2819,7 @@ int __scst_check_local_events(struct scst_cmd *cmd, bool preempt_tests_only)
 	}
 
 	if (!preempt_tests_only) {
-		if (dev->pr_is_set) {
+		if (dev->cl_ops->pr_is_set(dev)) {
 			if (unlikely(!scst_pr_is_cmd_allowed(cmd))) {
 				scst_set_cmd_error_status(cmd,
					SAM_STAT_RESERVATION_CONFLICT);
@@ -5980,14 +5983,15 @@ static int scst_target_reset(struct scst_mgmt_cmd *mcmd)
 	list_for_each_entry(acg_dev, &acg->acg_dev_list, acg_dev_list_entry) {
 		struct scst_device *d;
 		struct scst_tgt_dev *tgt_dev;
+		struct scst_lksb pr_lksb;
 		int found = 0;
 
 		dev = acg_dev->dev;
 
-		spin_lock_bh(&dev->dev_lock);
+		scst_res_lock(dev, &pr_lksb);
 		scst_block_dev(dev);
 		scst_process_reset(dev, mcmd->sess, NULL, mcmd, true);
-		spin_unlock_bh(&dev->dev_lock);
+		scst_res_unlock(dev, &pr_lksb);
 
 		list_for_each_entry(tgt_dev, &dev->dev_tgt_dev_list,
				    dev_tgt_dev_list_entry) {
@@ -6076,6 +6080,7 @@ static int scst_lun_reset(struct scst_mgmt_cmd *mcmd)
 	int res, rc;
 	struct scst_tgt_dev *tgt_dev = mcmd->mcmd_tgt_dev;
 	struct scst_device *dev = tgt_dev->dev;
+	struct scst_lksb pr_lksb;
 
 	TRACE_ENTRY();
 
@@ -6084,10 +6089,10 @@ static int scst_lun_reset(struct scst_mgmt_cmd *mcmd)
 
 	mcmd->needs_unblocking = 1;
 
-	spin_lock_bh(&dev->dev_lock);
+	scst_res_lock(dev, &pr_lksb);
 	scst_block_dev(dev);
 	scst_process_reset(dev, mcmd->sess, NULL, mcmd, true);
-	spin_unlock_bh(&dev->dev_lock);
+	scst_res_unlock(dev, &pr_lksb);
 
 	scst_call_dev_task_mgmt_fn_received(mcmd, tgt_dev);
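The scst_dlm_res_lock() and scst_dlm_res_unlock() implementations referenced by scst_dlm_cl_ops are outside this excerpt. The on-stack struct scst_lksb that the callers above pass in is presumably what carries the DLM lock between the lock and the unlock call; a sketch of that idea, reusing the hypothetical scst_dlm_lock_wait() wrapper from earlier (error handling omitted):

    static void scst_dlm_res_lock(struct scst_device *dev,
                                  struct scst_lksb *pr_lksb)
            __acquires(&dev->dev_lock)
    {
            /* Serialize SPC-2 reservation state cluster-wide via PR_LOCK. */
            memset(pr_lksb, 0, sizeof(*pr_lksb));
            scst_dlm_lock_wait(dev->pr_dlm->ls, DLM_LOCK_EX, pr_lksb, 0,
                               PR_LOCK, NULL);
            spin_lock_bh(&dev->dev_lock);
    }

    static void scst_dlm_res_unlock(struct scst_device *dev,
                                    struct scst_lksb *pr_lksb)
            __releases(&dev->dev_lock)
    {
            spin_unlock_bh(&dev->dev_lock);
            if (pr_lksb->lksb.sb_lkid)
                    scst_dlm_unlock_wait(dev->pr_dlm->ls, pr_lksb);
    }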