Cluster SCSI state sync support

From Bart Van Assche <Bart.VanAssche@sandisk.com>
Prepared with help from Pralay Dakua <Pralay.Dakua@sandisk.com>



git-svn-id: http://svn.code.sf.net/p/scst/svn/trunk@6573 d57e44dd-8a1f-0410-8b47-8ef2f437770f
This commit is contained in:
Vladislav Bolkhovitin
2015-11-06 03:23:46 +00:00
parent 5058156b26
commit 9d61dc4b03
16 changed files with 2261 additions and 39 deletions

View File

@@ -36,7 +36,9 @@ rtf: $(RTFS)
$(COMMAND)rtf $(<)
clean:
mv "Using the DLM as a Distributed In-Memory Database.pdf" "Using the DLM as a Distributed In-Memory Database.pdf_"
rm -f *.txt *.html *.tex *.dvi *.ps *.pdf *.info *.lyx *.rtf
mv "Using the DLM as a Distributed In-Memory Database.pdf_" "Using the DLM as a Distributed In-Memory Database.pdf"
extraclean: clean
rm -f *.orig *.rej

166
scst/README.dlm Normal file
View File

@@ -0,0 +1,166 @@
Synchronization of the Persistent Reservation Information via the DLM
=====================================================================
Introduction
------------
In an H.A. setup in which multiple servers share data, the persistent
reservation state must be kept consistent across the cluster.
One possible approach is to use the DLM to keep the PR state synchronized
across nodes. Since the DLM can associate data with each DLM lock object,
DLM lock objects can be used to store PR data. The data that is associated
with a DLM lock object is called the Lock Value Block or LVB. The code in
scst_dlm.c uses the DLM to keep PR data synchronized across all nodes in
a cluster.
Software Components
-------------------
The following software components are needed by the code in scst_dlm.c:
* The DLM kernel driver (dlm.ko). This driver is only built if CONFIG_DLM
has been set.
* The DLM control daemon (dlm_controld.pcmk). This daemon passes cluster
node IDs and IP addresses to the DLM kernel driver via the configfs
interface of the DLM kernel driver.
* Corosync to manage cluster membership of the cluster nodes and to assign
a node ID to each cluster node.
* A facility to start the DLM control daemon, e.g. Pacemaker.
On most Linux distributions these components are provided by the packages
named kernel, dlm, corosync and pacemaker.
DLM Configuration
-----------------
The DLM kernel module supports the TCP and SCTP communication protocols. An
advantage of SCTP for H.A. purposes is that it supports multihoming. One of
these protocols can be selected via the -r <proto> option of dlm_controld.
That option can be set via the "args" argument of the Pacemaker dlm_controld
resource. For more information, see also:
* The dlm_controld(8) man page.
* In the "Pacemaker 1.1, Clusters from Scratch" guide, the section "Configure
the Cluster for the DLM".
* The dlm_controld resource agent: /usr/lib/ocf/resource.d/pacemaker/controld
Here is an example of how to set up a cluster with two nodes and how to
configure and start the DLM control daemon:
1. If a network switch is present between the two nodes, enable IPv4 multicast
on that switch.
2. Copy /etc/corosync/corosync.conf.example into /etc/corosync/corosync.conf
and edit that file.
3. If a file /etc/default/corosync exists, enable Corosync in that file.
4. Start Corosync:
systemctl start corosync || /etc/init.d/corosync start
5. Check that all configured Corosync rings have two members:
corosync-cfgtool -s && { corosync-cmapctl | grep members; }
6. Start pcsd:
systemctl start pcsd || /etc/init.d/pcsd start
7. Set up cluster authentication:
pcs cluster auth centos7-vm centos7b-vm
8. Start Pacemaker:
systemctl start pacemaker || /etc/init.d/pacemaker start
9. If the cluster has only two nodes, disable the Pacemaker quorum policy and
disable STONITH:
crm_attribute -t crm_config -n no-quorum-policy -v ignore
crm_attribute -t crm_config -n stonith-enabled -v false
10. Check the cluster status:
pcs status
11. Create a Pacemaker resource for dlm_controld:
pcs resource delete dlm
pcs resource create dlm ocf:pacemaker:controld \
args="-q0 -f0" allow_stonith_disabled=true \
op monitor timeout=60 \
--clone interleave=true
12. Check the Pacemaker status:
pcs status
Startup and Shutdown
--------------------
The startup sequence is as follows:
* Load and configure SCST with cluster_mode = 0 and with all target ports
disabled.
* Enable cluster mode for all SCST devices that can be accessed through more
than one cluster node:
for x in /sys/kernel/scst_tgt/handlers/*/*/; do
echo 1 >$x/cluster_mode &
done
wait
* Start Corosync and Pacemaker.
* Wait until Pacemaker has reached the idle state:
pacemaker_dc_status() {
local dc
dc="$(crmadmin -D 2>/dev/null | sed 's/Designated Controller is: //')"
[ -n "$dc" ] &&
crmadmin -S "$dc" 2>/dev/null |
sed 's/^Status of crmd@[^[:blank:]]*:[[:blank:]]\([^[:blank:]]*\).*/\1/'
}
for ((i=0;i<300;i++)); do
[ "$(pacemaker_dc_status)" = "S_IDLE" ] && break
sleep 1
done
* Enable SCST target ports.
* If no DLM resource has been configured in Pacemaker, start dlm_controld.pcmk
explicitly.
The proper shutdown order is as follows:
* Tell SCST to stop accepting SCSI commands and wait until all initiators have
logged out:
for x in $(find /sys/kernel/scst_tgt/targets/ -name enabled); do
echo 0 > $x &
done
wait
while ls -Ad /sys/kernel/scst_tgt/targets/*/*/sessions/* >/dev/null 2>&1; do
sleep 1
done
* Tell SCST to release the DLM lockspaces:
while grep -q '^1$' /sys/kernel/scst_tgt/devices/*/cluster_mode 2>/dev/null
do
for x in /sys/kernel/scst_tgt/devices/*/cluster_mode; do
{ [ -e "$x" ] && echo 0 > "$x"; } &
done
wait
sleep 1
done
* Stop Pacemaker and Corosync
* Unload the SCST kernel modules
* Unload the DLM kernel driver
Lockspace names
---------------
The names of the DLM lockspaces used by SCST follow the following pattern:
scst-<t10_dev_id> where t10_dev_id is the T10 device ID of the SCST device
associated with this lockspace.
Notes
-----
Since the lockspace name depends on the t10_dev_id it is not allowed to
change the t10_dev_id if cluster mode has been enabled.
Testing
-------
Two examples of test suites for the cluster PR support code are:
* The SCSI conformance tests in the libiscsi project.
* The Windows Cluster Validation Tests
(https://technet.microsoft.com/en-us/library/Cc726064.aspx).
To do
-----
Ensure that PREEMPT AND ABORT affects all cluster nodes instead of only the
cluster node that received this command.
See also
--------
* Bart Van Assche, Using the DLM as a distributed in-memory database, Linux
Plumbers North America, Seattle, August 20, 2015
(https://linuxplumbersconf.org/2015/ocw//system/presentations/2691/original/Using%20the%20DLM%20as%20a%20Distributed%20In-Memory%20Database.pdf).
* Andrew Beekhof, Pacemaker Configuration Explained, 2015
(http://clusterlabs.org/doc/en-US/Pacemaker/1.1/html/Pacemaker_Explained/).
* Andrew Beekhof, Clusters from Scratch, 2015
(http://clusterlabs.org/doc/en-US/Pacemaker/1.1-pcs/html/Clusters_from_Scratch/index.html).

View File

@@ -38,6 +38,7 @@
#include <linux/interrupt.h>
#include <linux/wait.h>
#include <linux/cpumask.h>
#include <linux/dlm.h>
#ifdef CONFIG_SCST_MEASURE_LATENCY
#include <linux/log2.h>
#endif
@@ -1991,6 +1992,18 @@ struct scst_cmd_threads {
int scst_set_thr_cpu_mask(struct scst_cmd_threads *cmd_threads,
cpumask_t *cpu_mask);
struct scst_pr_dlm_data;
/*
 * DLM lock status block with a completion used to wait for the end of
 * synchronous DLM lock operations.
 */
struct scst_lksb {
	/* Status block handed to the DLM locking calls. */
	struct dlm_lksb lksb;
	/* Signaled when the DLM operation using @lksb has completed. */
	struct completion compl;
	/* Back-pointer to the owning PR-over-DLM state — TODO confirm it may be NULL outside cluster mode. */
	struct scst_pr_dlm_data *pr_dlm;
};
/*
* Used to execute cmd's in order of arrival, honoring SCSI task attributes
*/
@@ -2545,6 +2558,53 @@ struct scst_dev_registrant {
/* 2 auxiliary fields used to rollback changes for errors, etc. */
struct list_head aux_list_entry;
__be64 rollback_key;
/* For registrant information managed via the DLM. */
int dlm_idx;
struct scst_lksb lksb;
char lvb[PR_DLM_LVB_LEN];
};
/**
 * struct scst_cl_ops - Encapsulation of behavior that depends on cluster mode
 * @pr_init: Initialize resources needed by one of the functions below.
 * @pr_cleanup: Free resources allocated by one of the functions below.
 * @pr_is_set: Whether or not one of the registrants holds a reservation.
 * @pr_init_reg: Cluster-specific registrant initialization.
 * @pr_rm_reg: Cluster-specific registrant cleanup.
 * @pr_write_lock: Lock the PR data structures for write access.
 * @pr_write_unlock: Unlock the PR data structures for write access.
 * @reserved: Whether an initiator holds an SPC-2 reservation.
 * @res_lock: Protect the SPC-2 reservation state against concurrent
 *	modifications.
 * @res_unlock: Counterpart of @res_lock.
 * @is_rsv_holder: Whether session @sess holds an SPC-2 reservation on @dev.
 * @is_not_rsv_holder: Whether another session than @sess holds an SPC-2
 *	reservation on @dev.
 * @reserve: Apply an SPC-2 reservation for session @sess on @dev if
 *	@sess != NULL or clear that reservation if @sess == NULL.
 */
struct scst_cl_ops {
	int (*pr_init)(struct scst_device *dev, const char *cl_dev_id);
	void (*pr_cleanup)(struct scst_device *dev);
	bool (*pr_is_set)(struct scst_device *dev);
	void (*pr_init_reg)(struct scst_device *dev,
		struct scst_dev_registrant *reg);
	void (*pr_rm_reg)(struct scst_device *dev,
		struct scst_dev_registrant *reg);
	void (*pr_write_lock)(struct scst_device *dev,
		struct scst_lksb *pr_lksb);
	void (*pr_write_unlock)(struct scst_device *dev,
		struct scst_lksb *pr_lksb);
	bool (*reserved)(struct scst_device *dev);
	void (*res_lock)(struct scst_device *dev, struct scst_lksb *pr_lksb);
	void (*res_unlock)(struct scst_device *dev, struct scst_lksb *pr_lksb);
	bool (*is_rsv_holder)(struct scst_device *dev,
		struct scst_session *sess);
	bool (*is_not_rsv_holder)(struct scst_device *dev,
		struct scst_session *sess);
	void (*reserve)(struct scst_device *dev, struct scst_session *sess);
};
/*
@@ -2725,6 +2785,9 @@ struct scst_device {
/* Set if reserved via the SPC-2 SCSI RESERVE command. */
struct scst_session *reserved_by;
/* Operations that depend on whether or not cluster mode is enabled */
const struct scst_cl_ops *cl_ops;
/**********************************************************************
* Persistent reservation fields. Protected as follows:
* - Reading PR data must be protected via scst_pr_read_lock() /
@@ -2745,12 +2808,21 @@ struct scst_device {
/* Whether or not pr_file_name has been modified via sysfs. */
unsigned int pr_file_name_is_set:1;
/*
* Whether or not the PR state must be synchronized with other cluster
* nodes.
*/
unsigned int cluster_mode:1;
/* Persistent reservation type */
uint8_t pr_type;
/* Persistent reservation scope */
uint8_t pr_scope;
/* Data structures for managing PR data via the DLM */
struct scst_pr_dlm_data *pr_dlm;
/* Mutex to protect PR operations */
struct mutex dev_pr_mutex;
@@ -5405,4 +5477,9 @@ void scst_path_put(struct nameidata *nd);
#endif
int scst_remove_file(const char *name);
int scst_pr_set_cluster_mode(struct scst_device *dev, bool cluster_mode,
const char *cl_dev_id);
int scst_pr_init_dev(struct scst_device *dev);
void scst_pr_clear_dev(struct scst_device *dev);
#endif /* __SCST_H */

View File

@@ -735,4 +735,8 @@ enum {
E_TGT_PRIV_NOT_YET_SET = EBUSY
};
/* Size of the lock value block in the DLM PR lockspace */
#define PR_DLM_LVB_LEN 256
#endif /* __SCST_CONST_H */

View File

@@ -48,6 +48,13 @@ scst-y += scst_sysfs.o
scst-y += scst_mem.o
scst-y += scst_debug.o
scst-y += scst_pres.o
scst-y += scst_no_dlm.o
ifdef CONFIG_DLM
scst-y += scst_dlm.o
endif
ifdef CONFIG_DLM_MODULE
scst-y += scst_dlm.o
endif
scst-y += scst_tg.o
obj-$(CONFIG_SCST) += scst.o dev_handlers/

View File

@@ -396,6 +396,10 @@ static ssize_t vdisk_sysfs_removable_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
static ssize_t vdev_sysfs_filename_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
static ssize_t vdev_sysfs_cluster_mode_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf);
static ssize_t vdev_sysfs_cluster_mode_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count);
static ssize_t vdisk_sysfs_resync_size_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count);
static ssize_t vdisk_sysfs_sync_store(struct kobject *kobj,
@@ -483,6 +487,9 @@ static struct kobj_attribute vdisk_removable_attr =
__ATTR(removable, S_IRUGO, vdisk_sysfs_removable_show, NULL);
static struct kobj_attribute vdisk_filename_attr =
__ATTR(filename, S_IRUGO, vdev_sysfs_filename_show, NULL);
static struct kobj_attribute vdisk_cluster_mode_attr =
__ATTR(cluster_mode, S_IWUSR|S_IRUGO, vdev_sysfs_cluster_mode_show,
vdev_sysfs_cluster_mode_store);
static struct kobj_attribute vdisk_resync_size_attr =
__ATTR(resync_size, S_IWUSR, NULL, vdisk_sysfs_resync_size_store);
static struct kobj_attribute vdisk_sync_attr =
@@ -540,6 +547,7 @@ static const struct attribute *vdisk_fileio_attrs[] = {
&vdisk_o_direct_attr.attr,
&vdisk_removable_attr.attr,
&vdisk_filename_attr.attr,
&vdisk_cluster_mode_attr.attr,
&vdisk_resync_size_attr.attr,
&vdisk_sync_attr.attr,
&vdev_t10_vend_id_attr.attr,
@@ -567,6 +575,7 @@ static const struct attribute *vdisk_blockio_attrs[] = {
&vdisk_removable_attr.attr,
&vdisk_rotational_attr.attr,
&vdisk_filename_attr.attr,
&vdisk_cluster_mode_attr.attr,
&vdisk_resync_size_attr.attr,
&vdisk_sync_attr.attr,
&vdev_t10_vend_id_attr.attr,
@@ -680,6 +689,7 @@ static struct scst_dev_type vdisk_file_devtype = {
"filename, "
"nv_cache, "
"o_direct, "
"cluster_mode, "
"read_only, "
"removable, "
"rotational, "
@@ -734,6 +744,7 @@ static struct scst_dev_type vdisk_blk_devtype = {
"dif_filename, "
"filename, "
"nv_cache, "
"cluster_mode, "
"read_only, "
"removable, "
"rotational, "
@@ -1648,6 +1659,9 @@ next:
if (vdev_saved_mode_pages_enabled)
vdev_load_mode_pages(virt_dev);
res = scst_pr_set_cluster_mode(dev, dev->cluster_mode,
virt_dev->t10_dev_id);
out:
TRACE_EXIT();
return res;
@@ -1664,6 +1678,8 @@ static void vdisk_detach(struct scst_device *dev)
TRACE_DBG("virt_id %d", dev->virt_id);
scst_pr_set_cluster_mode(dev, false, virt_dev->t10_dev_id);
PRINT_INFO("Detached virtual device %s (\"%s\")",
virt_dev->name, vdev_get_filename(virt_dev));
@@ -4437,14 +4453,16 @@ static int vdisk_ctrl_m_pg(unsigned char *p, int pcontrol,
*/
p[2] |= 7 << 5; /* TST */
#endif
p[2] |= 1 << 2; /* D_SENSE */
p[2] |= 1 << 3; /* DPICZ */
p[2] |= 1 << 4; /* TMF_ONLY */
p[3] |= 0xF << 4; /* QUEUE ALGORITHM MODIFIER */
p[3] |= 3 << 1; /* QErr */
p[4] |= 1 << 3; /* SWP */
p[5] |= 1 << 6; /* TAS */
p[5] |= 0 << 7; /* ATO */
if (!virt_dev->dev->cluster_mode) {
p[2] |= 1 << 2; /* D_SENSE */
p[2] |= 1 << 3; /* DPICZ */
p[2] |= 1 << 4; /* TMF_ONLY */
p[3] |= 0xF << 4; /* QUEUE ALGORITHM MODIFIER */
p[3] |= 3 << 1; /* QErr */
p[4] |= 1 << 3; /* SWP */
p[5] |= 1 << 6; /* TAS */
p[5] |= 0 << 7; /* ATO */
}
break;
case 2: /* default */
p[2] |= virt_dev->tst << 5;
@@ -4920,6 +4938,13 @@ static enum compl_status_e vdisk_exec_mode_select(struct vdisk_cmd_params *p)
TRACE_ENTRY();
virt_dev = cmd->dev->dh_priv;
if (cmd->dev->cluster_mode) {
PRINT_ERROR("MODE SELECT: not supported in cluster mode\n");
scst_set_cmd_error(cmd,
SCST_LOAD_SENSE(scst_sense_invalid_field_in_cdb));
goto out;
}
mselect_6 = (MODE_SELECT == cmd->cdb[0]);
type = cmd->dev->type;
@@ -8810,6 +8835,104 @@ out:
return res;
}
static ssize_t vdev_sysfs_cluster_mode_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct scst_device *dev = container_of(kobj, struct scst_device,
dev_kobj);
return sprintf(buf, "%d\n%s", dev->cluster_mode,
dev->cluster_mode ?
SCST_SYSFS_KEY_MARK "\n" : "");
}
/*
 * Worker that applies a cluster_mode change requested via sysfs.
 * work->buf holds the string written by the user ("0" or "1").
 * Lock order: activity suspended first, then scst_mutex.
 */
static int vdev_sysfs_process_cluster_mode_store(
	struct scst_sysfs_work_item *work)
{
	struct scst_device *dev = work->dev;
	struct scst_vdisk_dev *virt_dev;
	long clm;
	int res;

	/* Quiesce SCSI command processing before switching PR back-ends. */
	res = scst_suspend_activity(SCST_SUSPEND_TIMEOUT_USER);
	if (res)
		goto out;

	res = mutex_lock_interruptible(&scst_mutex);
	if (res)
		goto resume;

	/*
	 * This is safe since we hold a reference on dev_kobj and since
	 * scst_assign_dev_handler() waits until all dev_kobj references
	 * have been dropped before invoking .detach().
	 */
	virt_dev = dev->dh_priv;

#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 39)
	res = kstrtol(work->buf, 0, &clm);
#else
	res = strict_strtol(work->buf, 0, &clm);
#endif
	if (res)
		goto unlock;
	/* Only the values 0 and 1 are accepted. */
	res = -EINVAL;
	if (clm < 0 || clm > 1)
		goto unlock;
	if (clm != dev->cluster_mode) {
		res = scst_pr_set_cluster_mode(dev, clm, virt_dev->t10_dev_id);
		if (res)
			goto unlock;
		dev->cluster_mode = clm;
	} else {
		/* No change requested; report success. */
		res = 0;
	}

unlock:
	mutex_unlock(&scst_mutex);

resume:
	scst_resume_activity();

out:
	/* Drop the reference taken in vdev_sysfs_cluster_mode_store(). */
	kobject_put(&dev->dev_kobj);
	return res;
}
/*
 * sysfs .store handler for the cluster_mode attribute. Copies the user
 * buffer and defers the actual work to the sysfs work queue because the
 * change may sleep (activity suspension, mutexes).
 */
static ssize_t vdev_sysfs_cluster_mode_store(struct kobject *kobj,
	struct kobj_attribute *attr, const char *buf, size_t count)
{
	struct scst_device *dev = container_of(kobj, struct scst_device,
		dev_kobj);
	struct scst_sysfs_work_item *work;
	char *arg;
	int res;

	TRACE_ENTRY();

	res = -ENOMEM;
	/* NUL-terminated copy of the (not necessarily terminated) sysfs buffer. */
	arg = kasprintf(GFP_KERNEL, "%.*s", (int)count, buf);
	if (!arg)
		goto out;

	res = scst_alloc_sysfs_work(vdev_sysfs_process_cluster_mode_store,
		false, &work);
	if (res)
		goto out;

	work->dev = dev;
	/* Hand the copy to the work item; the kfree() below then frees the old work->buf. */
	swap(work->buf, arg);
	/* Reference dropped by the worker when it is done with @dev. */
	kobject_get(&dev->dev_kobj);
	res = scst_sysfs_queue_wait_work(work);
	if (res)
		goto out;
	res = count;

out:
	kfree(arg);

	TRACE_EXIT_RES(res);
	return res;
}
static int vdisk_sysfs_process_resync_size_store(
struct scst_sysfs_work_item *work)
{
@@ -9165,6 +9288,10 @@ static ssize_t vdev_sysfs_t10_dev_id_store(struct kobject *kobj,
dev = container_of(kobj, struct scst_device, dev_kobj);
virt_dev = dev->dh_priv;
res = -EPERM;
if (dev->cluster_mode)
goto out;
write_lock(&vdisk_serial_rwlock);
if ((count > sizeof(virt_dev->t10_dev_id)) ||
@@ -9198,6 +9325,7 @@ static ssize_t vdev_sysfs_t10_dev_id_store(struct kobject *kobj,
out_unlock:
write_unlock(&vdisk_serial_rwlock);
out:
TRACE_EXIT_RES(res);
return res;
}

1458
scst/src/scst_dlm.c Normal file

File diff suppressed because it is too large Load Diff

154
scst/src/scst_dlm.h Normal file
View File

@@ -0,0 +1,154 @@
/*
* Copyright (c) 2013 - 2014 Fusion-io, Inc. All rights reserved.
* Copyright (C) 2014 - 2015 SanDisk Corporation.
*
* Synchronization of persistent registration data with DLM lock value blocks.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, version 2
* of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef __SCST_PRES_DLM_H
#define __SCST_PRES_DLM_H
#include <linux/dlm.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#define SCST_DLM_LOCKSPACE_PFX "scst-"
/*
* DLM lock names
*/
#define PR_LOCK "pr"
#define PR_DATA_LOCK "pr_data"
#define PR_PRE_JOIN_LOCK "pr_pre_join_%d"
#define PR_POST_JOIN_LOCK "pr_post_join_%d"
#define PR_PRE_UPDATE_LOCK "pr_pre_%d"
#define PR_POST_UPDATE_LOCK "pr_post_%d"
#define PR_REG_LOCK "pr_reg_%02d"
/*
 * Data members needed for managing PR data via the DLM.
 *
 * Lock order when using the DLM (from outer to inner):
 * - scst_mutex;
 * - ls_cr_mutex;
 * - ls_mutex;
 * - PR_LOCK;
 * - PR_PRE_UPDATE_LOCK, PR_POST_UPDATE_LOCK, PR_PRE_JOIN_LOCK,
 *   PR_POST_JOIN_LOCK;
 * - PR_DATA_LOCK;
 * - PR_REG_LOCK;
 * - dev_pr_mutex / dev_lock.
 */
struct scst_pr_dlm_data {
	/* Backpointer to the SCST device. */
	struct scst_device *dev;
	/* Lockspace name suffix. */
	const char *cl_dev_id;
	/* Mutex that protects initialization of the lockspace pointer. */
	struct mutex ls_cr_mutex;
	/* Mutex that protects the lock status blocks. */
	struct mutex ls_mutex;
	/*
	 * Pointer to the DLM lockspace that contains the persistent
	 * reservation and SPC-2 reservation data for device @dev.
	 */
	dlm_lockspace_t *ls;
	/* Time of the latest lockspace creation attempt. */
	unsigned long latest_lscr_attempt;
	/* Corosync node ID of the local node. */
	uint32_t local_nodeid;
	/* Number of elements in the nodeid array. */
	int participants;
	/* Corosync cluster node ID's. Protected by ls_mutex. */
	uint32_t *nodeid;
	/* Workqueue for copy_from_dlm_work. */
	struct workqueue_struct *from_wq;
	/* Workqueue for copy_to_dlm_work. */
	struct workqueue_struct *to_wq;
	/* Workqueue for lvb_upd_work. */
	struct workqueue_struct *upd_wq;
	/*
	 * Work items for the DLM notification protocol. Their handlers live
	 * in scst_dlm.c (not visible here); see that file for the exact role
	 * of each work item.
	 */
	struct work_struct pre_join_work;
	struct work_struct pre_upd_work;
	struct work_struct copy_from_dlm_work;
	struct work_struct copy_to_dlm_work;
	struct work_struct lvb_upd_work;
	struct work_struct reread_lvb_work;
	/*
	 * DLM lock IDs of the locks used for persistent reservation data and
	 * the associated notification protocol.
	 */
	struct scst_lksb pre_join_lksb;
	struct scst_lksb post_join_lksb;
	struct scst_lksb data_lksb;
	struct scst_lksb pre_upd_lksb;
	struct scst_lksb post_upd_lksb;
	/* PR_DATA_LOCK LVB. */
	uint8_t lvb[PR_DLM_LVB_LEN];
	/* SPC-2 reservation state information. */
	uint32_t reserved_by_nodeid;
};
/**
 * struct pr_lvb - PR_DATA_LOCK LVB data format
 * @nr_registrants: number of reservation keys that have been registered
 * @pr_generation: persistent reservation generation
 * @version: version of this structure
 * @pr_is_set: whether the device has been reserved persistently
 * @pr_type: persistent reservation type
 * @pr_scope: persistent reservation scope
 * @pr_aptpl: persistent reservation APTPL
 * @reserved: explicit padding; keeps @reserved_by_nodeid 4-byte aligned
 * @reserved_by_nodeid: Corosync node ID of the node holding an SPC-2
 *	reservation. Zero if no SPC-2 reservation is held.
 *
 * Stored in a DLM lock value block shared between nodes, hence the
 * fixed-endianness (__be32) multi-byte fields.
 */
struct pr_lvb {
	__be32 nr_registrants;
	__be32 pr_generation;
	u8 version;
	u8 pr_is_set;
	u8 pr_type;
	u8 pr_scope;
	u8 pr_aptpl;
	u8 reserved[3];
	__be32 reserved_by_nodeid;
};
/**
 * struct pr_reg_lvb - PR_REG_LOCK LVB data format
 * @key: reservation key
 * @rel_tgt_id: relative target id
 * @version: version of this structure
 * @is_holder: whether or not holding the reservation
 * @tid: transport ID - up to 228 bytes for iSCSI
 *
 * Stored in a DLM lock value block; the total size (240 bytes) must stay
 * within PR_DLM_LVB_LEN (256 bytes).
 */
struct pr_reg_lvb {
	__be64 key;
	__be16 rel_tgt_id;
	u8 version;
	u8 is_holder;
	u8 tid[228];
};
#endif /* __SCST_PRES_DLM_H */

View File

@@ -6090,17 +6090,18 @@ out:
static void scst_clear_reservation(struct scst_tgt_dev *tgt_dev)
{
struct scst_device *dev = tgt_dev->dev;
struct scst_lksb pr_lksb;
int release = 0;
TRACE_ENTRY();
spin_lock_bh(&dev->dev_lock);
scst_res_lock(dev, &pr_lksb);
if (scst_is_reservation_holder(dev, tgt_dev->sess)) {
/* This is one who holds the reservation */
scst_clear_dev_reservation(dev);
release = 1;
}
spin_unlock_bh(&dev->dev_lock);
scst_res_unlock(dev, &pr_lksb);
if (release)
scst_send_release(dev);
@@ -11729,11 +11730,12 @@ static bool __scst_dev_check_set_UA(struct scst_device *dev,
void scst_dev_check_set_UA(struct scst_device *dev,
struct scst_cmd *exclude, const uint8_t *sense, int sense_len)
{
struct scst_lksb lksb;
bool rc;
spin_lock_bh(&dev->dev_lock);
scst_res_lock(dev, &lksb);
rc = __scst_dev_check_set_UA(dev, exclude, sense, sense_len);
spin_unlock_bh(&dev->dev_lock);
scst_res_unlock(dev, &lksb);
if (rc)
scst_unblock_aborted_cmds(NULL, NULL, dev, false);
@@ -12480,6 +12482,7 @@ void scst_reassign_retained_sess_states(struct scst_session *new_sess,
list_for_each_entry(dev, &scst_dev_list, dev_list_entry) {
struct scst_tgt_dev *tgt_dev;
struct scst_tgt_dev *new_tgt_dev = NULL, *old_tgt_dev = NULL;
struct scst_lksb pr_lksb;
TRACE_DBG("Processing dev %s", dev->virt_name);
@@ -12506,11 +12509,13 @@ void scst_reassign_retained_sess_states(struct scst_session *new_sess,
/** Reassign regular reservations **/
scst_res_lock(dev, &pr_lksb);
if (scst_is_reservation_holder(dev, old_sess)) {
scst_reserve_dev(dev, new_sess);
TRACE_DBG("Reservation reassigned from old_tgt_dev %p "
"to new_tgt_dev %p", old_tgt_dev, new_tgt_dev);
}
scst_res_unlock(dev, &pr_lksb);
/** Reassign PRs **/

118
scst/src/scst_no_dlm.c Normal file
View File

@@ -0,0 +1,118 @@
/*
* Copyright (c) 2013 - 2014 Fusion-io, Inc. All rights reserved.
* Copyright (C) 2014 - 2015 SanDisk Corporation.
*
* Synchronization framework of persistent registration data without DLM lock.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, version 2
* of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifdef INSIDE_KERNEL_TREE
#include <scst/scst.h>
#include <scst/scst_const.h>
#else
#include "scst.h"
#include "scst_const.h"
#endif
#include "scst_priv.h"
#include "scst_pres.h"
/* No-op: nothing to set up when PR data stays local to this node. */
static int scst_no_dlm_pr_init(struct scst_device *dev, const char *cl_dev_id)
{
	return 0;
}
/* No-op counterpart of scst_no_dlm_pr_init(). */
static void scst_no_dlm_pr_cleanup(struct scst_device *dev)
{
}
/* Whether a persistent reservation is held, from local state only. */
static bool scst_no_dlm_pr_is_set(struct scst_device *dev)
{
	return dev->pr_is_set;
}
/* No-op: no per-registrant cluster state outside cluster mode. */
static void scst_no_dlm_pr_init_reg(struct scst_device *dev,
	struct scst_dev_registrant *reg)
{
}
/* No-op counterpart of scst_no_dlm_pr_init_reg(). */
static void scst_no_dlm_pr_rm_reg(struct scst_device *dev,
	struct scst_dev_registrant *reg)
{
}
/* Take the local PR write lock; @pr_lksb is unused outside cluster mode. */
static void scst_no_dlm_pr_write_lock(struct scst_device *dev,
	struct scst_lksb *pr_lksb)
{
	scst_pr_write_lock(dev);
}
/* Release the local PR write lock; @pr_lksb is unused outside cluster mode. */
static void scst_no_dlm_pr_write_unlock(struct scst_device *dev,
	struct scst_lksb *pr_lksb)
{
	scst_pr_write_unlock(dev);
}
static bool scst_no_dlm_reserved(struct scst_device *dev)
{
return dev->reserved_by;
}
/*
 * Protect SPC-2 reservation state with the plain dev_lock; no DLM lock
 * is involved. Must not be called from IRQ context because it takes a
 * _bh spinlock.
 */
static void scst_no_dlm_res_lock(struct scst_device *dev,
	struct scst_lksb *pr_lksb)
	__acquires(&dev->dev_lock)
{
	EXTRACHECKS_BUG_ON(in_irq() || irqs_disabled());
	spin_lock_bh(&dev->dev_lock);
}
/* Counterpart of scst_no_dlm_res_lock(). */
static void scst_no_dlm_res_unlock(struct scst_device *dev,
	struct scst_lksb *pr_lksb)
	__releases(&dev->dev_lock)
{
	spin_unlock_bh(&dev->dev_lock);
}
/* Whether session @sess holds the SPC-2 reservation on @dev. */
static bool scst_no_dlm_is_rsv_holder(struct scst_device *dev,
	struct scst_session *sess)
{
	EXTRACHECKS_BUG_ON(sess == NULL);

	return dev->reserved_by == sess;
}
/* Whether a session other than @sess holds the SPC-2 reservation on @dev. */
static bool scst_no_dlm_is_not_rsv_holder(struct scst_device *dev,
	struct scst_session *sess)
{
	EXTRACHECKS_BUG_ON(sess == NULL);

	return dev->reserved_by && dev->reserved_by != sess;
}
/* Set (@sess != NULL) or clear (@sess == NULL) the SPC-2 reservation holder. */
static void scst_no_dlm_reserve(struct scst_device *dev,
	struct scst_session *sess)
{
	dev->reserved_by = sess;
}
/* scst_cl_ops implementation for single-node (non-cluster) operation. */
const struct scst_cl_ops scst_no_dlm_cl_ops = {
	.pr_init		= scst_no_dlm_pr_init,
	.pr_cleanup		= scst_no_dlm_pr_cleanup,
	.pr_is_set		= scst_no_dlm_pr_is_set,
	.pr_init_reg		= scst_no_dlm_pr_init_reg,
	.pr_rm_reg		= scst_no_dlm_pr_rm_reg,
	.pr_write_lock		= scst_no_dlm_pr_write_lock,
	.pr_write_unlock	= scst_no_dlm_pr_write_unlock,
	.reserved		= scst_no_dlm_reserved,
	.res_lock		= scst_no_dlm_res_lock,
	.res_unlock		= scst_no_dlm_res_unlock,
	.is_rsv_holder		= scst_no_dlm_is_rsv_holder,
	.is_not_rsv_holder	= scst_no_dlm_is_not_rsv_holder,
	.reserve		= scst_no_dlm_reserve,
};

View File

@@ -88,7 +88,7 @@ static inline void scst_assert_pr_mutex_held(struct scst_device *dev)
}
#endif
static inline int scst_tid_size(const uint8_t *tid)
int scst_tid_size(const uint8_t *tid)
{
sBUG_ON(tid == NULL);
@@ -185,7 +185,7 @@ out_error:
}
/* Must be called under dev_pr_mutex */
static inline void scst_pr_set_holder(struct scst_device *dev,
void scst_pr_set_holder(struct scst_device *dev,
struct scst_dev_registrant *holder, uint8_t scope, uint8_t type)
{
scst_assert_pr_mutex_held(dev);
@@ -336,7 +336,7 @@ static void scst_pr_find_registrants_list_key(struct scst_device *dev,
}
/* dev_pr_mutex must be locked */
static struct scst_dev_registrant *scst_pr_find_reg(
struct scst_dev_registrant *scst_pr_find_reg(
struct scst_device *dev, const uint8_t *transport_id,
const uint16_t rel_tgt_id)
{
@@ -379,7 +379,7 @@ static void scst_pr_clear_reservation(struct scst_device *dev)
}
/* Must be called under dev_pr_mutex */
static void scst_pr_clear_holder(struct scst_device *dev)
void scst_pr_clear_holder(struct scst_device *dev)
{
TRACE_ENTRY();
@@ -401,7 +401,7 @@ static void scst_pr_clear_holder(struct scst_device *dev)
}
/* Must be called under dev_pr_mutex */
static struct scst_dev_registrant *scst_pr_add_registrant(
struct scst_dev_registrant *scst_pr_add_registrant(
struct scst_device *dev, const uint8_t *transport_id,
const uint16_t rel_tgt_id, __be64 key,
bool dev_lock_locked)
@@ -441,6 +441,8 @@ static struct scst_dev_registrant *scst_pr_add_registrant(
goto out;
}
dev->cl_ops->pr_init_reg(dev, reg);
reg->transport_id = kmemdup(transport_id, scst_tid_size(transport_id),
gfp_flags);
if (reg->transport_id == NULL) {
@@ -496,7 +498,7 @@ out_free:
}
/* Must be called under dev_pr_mutex */
static void scst_pr_remove_registrant(struct scst_device *dev,
void scst_pr_remove_registrant(struct scst_device *dev,
struct scst_dev_registrant *reg)
{
TRACE_ENTRY();
@@ -510,6 +512,8 @@ static void scst_pr_remove_registrant(struct scst_device *dev,
list_del(&reg->dev_registrants_list_entry);
dev->cl_ops->pr_rm_reg(dev, reg);
if (scst_pr_is_holder(dev, reg))
scst_pr_clear_holder(dev);
@@ -1142,6 +1146,7 @@ out:
int scst_pr_init(struct scst_device *dev)
{
mutex_init(&dev->dev_pr_mutex);
dev->cl_ops = &scst_no_dlm_cl_ops;
dev->pr_generation = 0;
dev->pr_is_set = 0;
dev->pr_holder = NULL;
@@ -1155,8 +1160,41 @@ int scst_pr_init(struct scst_device *dev)
/* Free the resources allocated by scst_pr_init(). */
void scst_pr_cleanup(struct scst_device *dev)
{
dev->cl_ops->pr_cleanup(dev);
}
/*
 * Switch device @dev between single-node and DLM-backed PR handling.
 * @cl_dev_id becomes the lockspace name suffix (see scst_dlm.h).
 * Caller must hold scst_mutex and activity must be suspended.
 * Returns 0 on success or a negative error code.
 */
int scst_pr_set_cluster_mode(struct scst_device *dev, bool cluster_mode,
	const char *cl_dev_id)
{
	bool cluster_mode_enabled = false;
	int res = 0;

#if defined(CONFIG_DLM) || defined(CONFIG_DLM_MODULE)
	/* Cluster mode is in effect iff the DLM ops table is installed. */
	cluster_mode_enabled = dev->cl_ops == &scst_dlm_cl_ops;
	if (cluster_mode_enabled == cluster_mode)
		goto out;
	PRINT_INFO("%s: changing cluster_mode from %d into %d", dev->virt_name,
		cluster_mode_enabled, cluster_mode);
	dev->cl_ops->pr_cleanup(dev);
	dev->cl_ops = cluster_mode ? &scst_dlm_cl_ops : &scst_no_dlm_cl_ops;
	res = dev->cl_ops->pr_init(dev, cl_dev_id);
	if (res) {
		PRINT_ERROR("%s: changing cluster_mode into %d failed: %d",
			dev->virt_name, cluster_mode, res);
		/* Fall back to single-node mode on failure. */
		dev->cl_ops = &scst_no_dlm_cl_ops;
	}
#else
	/*
	 * NOTE(review): -ENOTSUPP is a kernel-internal value; consider
	 * -EOPNOTSUPP if this result can reach user space. Also, in this
	 * branch the "out" label below is unreferenced (-Wunused-label) —
	 * confirm whether that warning matters for this build.
	 */
	res = cluster_mode ? -ENOTSUPP : 0;
#endif

out:
	return res;
}
EXPORT_SYMBOL(scst_pr_set_cluster_mode);
/* Must be called under dev_pr_mutex or before dev is on the device list. */
int scst_pr_init_dev(struct scst_device *dev)
{

View File

@@ -79,6 +79,11 @@ static inline void scst_pr_read_unlock(struct scst_device *dev)
mutex_unlock(&dev->dev_pr_mutex);
}
static inline void lockdep_assert_pr_read_lock_held(struct scst_device *dev)
{
lockdep_assert_held(&dev->dev_pr_mutex);
}
static inline void scst_pr_write_lock(struct scst_device *dev)
{
mutex_lock(&dev->dev_pr_mutex);
@@ -89,6 +94,11 @@ static inline void scst_pr_write_unlock(struct scst_device *dev)
mutex_unlock(&dev->dev_pr_mutex);
}
static inline void lockdep_assert_pr_write_lock_held(struct scst_device *dev)
{
lockdep_assert_held(&dev->dev_pr_mutex);
}
int scst_pr_set_file_name(struct scst_device *dev, char **prev,
const char *fmt, ...) __printf(3, 4);
@@ -120,6 +130,21 @@ void scst_pr_report_caps(struct scst_cmd *cmd, uint8_t *buffer, int buffer_size)
void scst_pr_read_full_status(struct scst_cmd *cmd, uint8_t *buffer,
int buffer_size);
int scst_tid_size(const uint8_t *tid);
struct scst_dev_registrant *scst_pr_find_reg(struct scst_device *dev,
const uint8_t *transport_id, const uint16_t rel_tgt_id);
struct scst_dev_registrant *scst_pr_add_registrant(struct scst_device *dev,
const uint8_t *transport_id,
const uint16_t rel_tgt_id,
__be64 key,
bool dev_lock_locked);
void scst_pr_remove_registrant(struct scst_device *dev,
struct scst_dev_registrant *reg);
void scst_pr_set_holder(struct scst_device *dev,
struct scst_dev_registrant *holder, uint8_t scope,
uint8_t type);
void scst_pr_clear_holder(struct scst_device *dev);
#ifndef CONFIG_SCST_PROC
void scst_pr_sync_device_file(struct scst_tgt_dev *tgt_dev, struct scst_cmd *cmd);
#endif

View File

@@ -170,6 +170,9 @@ extern struct list_head scst_dev_type_list;
extern struct list_head scst_virtual_dev_type_list;
extern wait_queue_head_t scst_dev_cmd_waitQ;
extern const struct scst_cl_ops scst_no_dlm_cl_ops;
extern const struct scst_cl_ops scst_dlm_cl_ops;
#ifdef CONFIG_SCST_PROC
extern struct list_head scst_acg_list;
extern struct scst_acg *scst_default_acg;
@@ -398,6 +401,19 @@ void scst_free_session_callback(struct scst_session *sess);
void scst_check_retries(struct scst_tgt *tgt);
static inline int scst_dlm_new_lockspace(const char *name, int namelen,
dlm_lockspace_t **lockspace,
uint32_t flags,
int lvblen)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 3, 0)
return dlm_new_lockspace(name, namelen, lockspace, flags, lvblen);
#else
return dlm_new_lockspace(name, NULL, flags, lvblen, NULL, NULL, NULL,
lockspace);
#endif
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 30)
static inline int scst_exec_req(struct scsi_device *sdev,
const unsigned char *cmd, int cmd_len, int data_direction,
@@ -599,7 +615,23 @@ void scst_acn_sysfs_del(struct scst_acn *acn);
*/
static inline bool scst_dev_reserved(struct scst_device *dev)
{
return dev->reserved_by;
return dev->cl_ops->reserved(dev);
}
/* Protect SPC-2 reservation state against concurrent modifications. */
static inline void scst_res_lock(struct scst_device *dev,
struct scst_lksb *pr_lksb)
__acquires(&dev->dev_lock)
{
dev->cl_ops->res_lock(dev, pr_lksb);
}
static inline void scst_res_unlock(struct scst_device *dev,
struct scst_lksb *pr_lksb)
__releases(&dev->dev_lock)
{
dev->cl_ops->res_unlock(dev, pr_lksb);
}
/*
@@ -610,7 +642,7 @@ static inline bool scst_is_reservation_holder(struct scst_device *dev,
struct scst_session *sess)
{
EXTRACHECKS_BUG_ON(sess == NULL);
return dev->reserved_by == sess;
return dev->cl_ops->is_rsv_holder(dev, sess);
}
/*
@@ -620,23 +652,22 @@ static inline bool scst_is_reservation_holder(struct scst_device *dev,
static inline bool scst_is_not_reservation_holder(struct scst_device *dev,
struct scst_session *sess)
{
struct scst_session *reserved_by = dev->reserved_by;
EXTRACHECKS_BUG_ON(sess == NULL);
return reserved_by != NULL && reserved_by != sess;
return dev->cl_ops->is_not_rsv_holder(dev, sess);
}
static inline void scst_reserve_dev(struct scst_device *dev,
struct scst_session *sess)
{
lockdep_assert_held(&dev->dev_lock);
EXTRACHECKS_BUG_ON(sess == NULL);
dev->reserved_by = sess;
dev->cl_ops->reserve(dev, sess);
}
static inline void scst_clear_dev_reservation(struct scst_device *dev)
{
lockdep_assert_held(&dev->dev_lock);
dev->reserved_by = NULL;
dev->cl_ops->reserve(dev, NULL);
}
void scst_tgt_dev_del_free_UA(struct scst_tgt_dev *tgt_dev,

View File

@@ -3139,12 +3139,16 @@ static ssize_t scst_dev_sysfs_pr_file_name_store(struct kobject *kobj,
{
struct scst_sysfs_work_item *work;
struct scst_device *dev;
char *pr_file_name, *p;
int res = -ENOMEM;
char *pr_file_name = NULL, *p;
int res = -EPERM;
bool def = false;
dev = container_of(kobj, struct scst_device, dev_kobj);
if (dev->cluster_mode)
goto out;
res = -ENOMEM;
pr_file_name = kasprintf(GFP_KERNEL, "%.*s", (int)count, buf);
if (!pr_file_name) {
PRINT_ERROR("Unable to kasprintf() PR file name");

View File

@@ -2379,6 +2379,7 @@ static int scst_reserve_local(struct scst_cmd *cmd)
{
int res = SCST_EXEC_NOT_COMPLETED;
struct scst_device *dev;
struct scst_lksb pr_lksb;
TRACE_ENTRY();
@@ -2419,14 +2420,14 @@ static int scst_reserve_local(struct scst_cmd *cmd)
}
}
spin_lock_bh(&dev->dev_lock);
scst_res_lock(dev, &pr_lksb);
if (scst_is_not_reservation_holder(dev, cmd->sess)) {
spin_unlock_bh(&dev->dev_lock);
scst_res_unlock(dev, &pr_lksb);
scst_set_cmd_error_status(cmd, SAM_STAT_RESERVATION_CONFLICT);
goto out_done;
}
scst_reserve_dev(dev, cmd->sess);
spin_unlock_bh(&dev->dev_lock);
scst_res_unlock(dev, &pr_lksb);
out:
TRACE_EXIT_RES(res);
@@ -2446,6 +2447,7 @@ static int scst_release_local(struct scst_cmd *cmd)
{
int res = SCST_EXEC_NOT_COMPLETED;
struct scst_device *dev;
struct scst_lksb pr_lksb;
TRACE_ENTRY();
@@ -2466,7 +2468,7 @@ static int scst_release_local(struct scst_cmd *cmd)
}
}
spin_lock_bh(&dev->dev_lock);
scst_res_lock(dev, &pr_lksb);
/*
* The device could be RELEASED behind us, if RESERVING session
@@ -2489,7 +2491,7 @@ static int scst_release_local(struct scst_cmd *cmd)
scst_clear_dev_reservation(dev);
}
spin_unlock_bh(&dev->dev_lock);
scst_res_unlock(dev, &pr_lksb);
if (res == SCST_EXEC_COMPLETED)
goto out_done;
@@ -2616,6 +2618,7 @@ static int scst_persistent_reserve_out_local(struct scst_cmd *cmd)
int action;
uint8_t *buffer;
int buffer_size;
struct scst_lksb pr_lksb;
bool aborted = false;
TRACE_ENTRY();
@@ -2651,7 +2654,7 @@ static int scst_persistent_reserve_out_local(struct scst_cmd *cmd)
if (unlikely(buffer_size <= 0))
goto out_done;
scst_pr_write_lock(dev);
dev->cl_ops->pr_write_lock(dev, &pr_lksb);
/*
* Check if tgt_dev already registered. Also by this check we make
@@ -2742,7 +2745,7 @@ static int scst_persistent_reserve_out_local(struct scst_cmd *cmd)
res = SCST_EXEC_NOT_COMPLETED;
out_unlock:
scst_pr_write_unlock(dev);
dev->cl_ops->pr_write_unlock(dev, &pr_lksb);
scst_put_buf_full(cmd, buffer);
@@ -2816,7 +2819,7 @@ int __scst_check_local_events(struct scst_cmd *cmd, bool preempt_tests_only)
}
if (!preempt_tests_only) {
if (dev->pr_is_set) {
if (dev->cl_ops->pr_is_set(dev)) {
if (unlikely(!scst_pr_is_cmd_allowed(cmd))) {
scst_set_cmd_error_status(cmd,
SAM_STAT_RESERVATION_CONFLICT);
@@ -5980,14 +5983,15 @@ static int scst_target_reset(struct scst_mgmt_cmd *mcmd)
list_for_each_entry(acg_dev, &acg->acg_dev_list, acg_dev_list_entry) {
struct scst_device *d;
struct scst_tgt_dev *tgt_dev;
struct scst_lksb pr_lksb;
int found = 0;
dev = acg_dev->dev;
spin_lock_bh(&dev->dev_lock);
scst_res_lock(dev, &pr_lksb);
scst_block_dev(dev);
scst_process_reset(dev, mcmd->sess, NULL, mcmd, true);
spin_unlock_bh(&dev->dev_lock);
scst_res_unlock(dev, &pr_lksb);
list_for_each_entry(tgt_dev, &dev->dev_tgt_dev_list,
dev_tgt_dev_list_entry) {
@@ -6076,6 +6080,7 @@ static int scst_lun_reset(struct scst_mgmt_cmd *mcmd)
int res, rc;
struct scst_tgt_dev *tgt_dev = mcmd->mcmd_tgt_dev;
struct scst_device *dev = tgt_dev->dev;
struct scst_lksb pr_lksb;
TRACE_ENTRY();
@@ -6084,10 +6089,10 @@ static int scst_lun_reset(struct scst_mgmt_cmd *mcmd)
mcmd->needs_unblocking = 1;
spin_lock_bh(&dev->dev_lock);
scst_res_lock(dev, &pr_lksb);
scst_block_dev(dev);
scst_process_reset(dev, mcmd->sess, NULL, mcmd, true);
spin_unlock_bh(&dev->dev_lock);
scst_res_unlock(dev, &pr_lksb);
scst_call_dev_task_mgmt_fn_received(mcmd, tgt_dev);