- Memory barriers cleanup. Comments for them improved

- Small docs update
 - srpt/README_in-tree added



git-svn-id: http://svn.code.sf.net/p/scst/svn/trunk@614 d57e44dd-8a1f-0410-8b47-8ef2f437770f
This commit is contained in:
Vladislav Bolkhovitin
2008-12-10 10:55:01 +00:00
parent 76d0a69737
commit 41c3515516
12 changed files with 174 additions and 25 deletions

View File

@@ -2512,7 +2512,9 @@ static inline void iscsi_set_state_wake_up(struct iscsi_cmnd *req,
* We use wait_event() to wait for the state change, but it checks its
* condition without any protection, so without cmnd_get() it is
* possible that req will die "immediately" after the state assignment
* and wake_up() will operate on dead data.
* and wake_up() will operate on dead data. We use the ordered version
* of cmnd_get(), because "get" must be done before the state
* assignment.
*/
cmnd_get_ordered(req);
req->scst_state = new_state;
@@ -2710,6 +2712,12 @@ static int iscsi_xmit_response(struct scst_cmd *scst_cmd)
sBUG();
#endif
/*
* "_ordered" here to protect from reorder, which can lead to
* preliminary connection destroy in req_cmnd_release(). Just in
* case, actually, because reordering shouldn't go so far, but who
* knows..
*/
conn_get_ordered(conn);
req_cmnd_release(req);
iscsi_try_local_processing(conn);

View File

@@ -475,6 +475,7 @@ static inline void cmnd_get(struct iscsi_cmnd *cmnd)
static inline void cmnd_get_ordered(struct iscsi_cmnd *cmnd)
{
cmnd_get(cmnd);
/* See comments for each cmnd_get_ordered() use */
smp_mb__after_atomic_inc();
}
@@ -546,6 +547,7 @@ static inline void conn_get(struct iscsi_conn *conn)
static inline void conn_get_ordered(struct iscsi_conn *conn)
{
conn_get(conn);
/* See comments for each conn_get_ordered() use */
smp_mb__after_atomic_inc();
}
@@ -556,8 +558,9 @@ static inline void conn_put(struct iscsi_conn *conn)
sBUG_ON(atomic_read(&conn->conn_ref_cnt) == 0);
/*
* It always ordered to protect from undesired side effects like
* accessing just destroyed object because of this *_dec() reordering.
* Make it always ordered to protect from undesired side effects like
* accessing just destroyed by close_conn() conn caused by reordering
* of this atomic_dec().
*/
smp_mb__before_atomic_dec();
atomic_dec(&conn->conn_ref_cnt);

View File

@@ -359,7 +359,7 @@ else
fi \
| process_patch "srpt.diff"
add_file "srpt/README" "Documentation/scst/README.srpt" \
add_file "srpt/README_in-tree" "Documentation/scst/README.srpt" \
| process_patch "srpt-doc.diff"

View File

@@ -274,6 +274,8 @@ For changing VMSPLIT option (CONFIG_VMSPLIT to be precise) you should in
- General setup->Configure standard kernel features (for small systems): ON
- General setup->Prompt for development and/or incomplete code/drivers: ON
- Processor type and features->High Memory Support: OFF
- Processor type and features->Memory split: according to amount of

View File

@@ -222,6 +222,8 @@ For changing VMSPLIT option (CONFIG_VMSPLIT to be precise) you should in
- General setup->Configure standard kernel features (for small systems): ON
- General setup->Prompt for development and/or incomplete code/drivers: ON
- Processor type and features->High Memory Support: OFF
- Processor type and features->Memory split: according to amount of

View File

@@ -1270,6 +1270,11 @@ static int dev_user_process_reply_exec(struct scst_user_cmd *ucmd,
if (unlikely((cmd->data_direction == SCST_DATA_READ) ||
(cmd->resp_data_len != 0)))
goto out_inval;
/*
* background_exec assignment must be after ucmd get.
* Otherwise, due to reordering, in dev_user_process_reply()
* it is possible that ucmd is destroyed before it "got" here.
*/
ucmd_get_ordered(ucmd);
ucmd->background_exec = 1;
TRACE_DBG("Background ucmd %p", ucmd);
@@ -1383,6 +1388,8 @@ static int dev_user_process_reply(struct scst_user_dev *dev,
goto out_unlock;
}
/* To sync. with dev_user_process_reply_exec(). See comment there. */
smp_mb();
if (ucmd->background_exec) {
state = UCMD_STATE_EXECING;
goto unlock_process;

View File

@@ -2450,7 +2450,6 @@ static void blockio_exec_rw(struct scst_cmd *cmd, struct scst_vdisk_thr *thr,
/* +1 to prevent erroneous too early command completion */
atomic_set(&blockio_work->bios_inflight, bios+1);
smp_wmb();
while (hbio) {
bio = hbio;

View File

@@ -1528,9 +1528,10 @@ void scst_check_retries(struct scst_tgt *tgt)
/*
* We don't worry about overflow of finished_cmds, because we check
* only for its change
* only for its change.
*/
atomic_inc(&tgt->finished_cmds);
/* See comment in scst_queue_retry_cmd() */
smp_mb__after_atomic_inc();
if (unlikely(tgt->retry_cmds > 0)) {
struct scst_cmd *c, *tc;
@@ -1541,8 +1542,7 @@ void scst_check_retries(struct scst_tgt *tgt)
spin_lock_irqsave(&tgt->tgt_lock, flags);
list_for_each_entry_safe(c, tc, &tgt->retry_cmd_list,
cmd_list_entry)
{
cmd_list_entry) {
tgt->retry_cmds--;
TRACE_RETRY("Moving retry cmd %p to head of active "
@@ -3125,7 +3125,13 @@ static void scst_block_dev(struct scst_device *dev, int outstanding)
__scst_block_dev(dev);
spin_unlock_bh(&dev->dev_lock);
/* spin_unlock_bh() doesn't provide the necessary memory barrier */
/*
* Memory barrier is necessary here, because we need to read
* on_dev_count in wait_event() below after we increased block_count.
* Otherwise, we can miss wake up in scst_dec_on_dev_cmd().
* We use the explicit barrier, because spin_unlock_bh() doesn't
* provide the necessary memory barrier functionality.
*/
smp_mb();
TRACE_MGMT_DBG("Waiting during blocking outstanding %d (on_dev_count "
@@ -3211,7 +3217,6 @@ repeat:
spin_lock_bh(&dev->dev_lock);
if (unlikely(test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags)))
goto out_unlock;
barrier(); /* to reread block_count */
if (dev->block_count > 0) {
scst_dec_on_dev_cmd(cmd);
TRACE_MGMT_DBG("Delaying cmd %p due to blocking or "
@@ -3230,7 +3235,6 @@ repeat:
}
if (unlikely(dev->dev_serialized)) {
spin_lock_bh(&dev->dev_lock);
barrier(); /* to reread block_count */
if (dev->block_count == 0) {
TRACE_MGMT_DBG("cmd %p (tag %llu), blocking further "
"cmds due to serializing (dev %p)", cmd,
@@ -3405,7 +3409,6 @@ void scst_xmit_process_aborted_cmd(struct scst_cmd *cmd)
scst_done_cmd_mgmt(cmd);
smp_rmb();
if (test_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags)) {
if (cmd->completed) {
/* It's completed and it's OK to return its result */
@@ -3548,6 +3551,7 @@ static void tm_dbg_timer_fn(unsigned long arg)
{
TRACE_MGMT_DBG("%s", "delayed cmd timer expired");
tm_dbg_flags.tm_dbg_release = 1;
/* Used to make sure that all woken up threads see the new value */
smp_wmb();
wake_up_all(tm_dbg_p_cmd_list_waitQ);
}
@@ -3763,6 +3767,10 @@ void tm_dbg_task_mgmt(struct scst_device *dev, const char *fn, int force)
tm_dbg_delayed_cmds_count);
tm_dbg_change_state();
tm_dbg_flags.tm_dbg_release = 1;
/*
* Used to make sure that all woken up threads see the new
* value.
*/
smp_wmb();
if (tm_dbg_p_cmd_list_waitQ != NULL)
wake_up_all(tm_dbg_p_cmd_list_waitQ);

View File

@@ -501,13 +501,18 @@ int scst_suspend_activity(bool interruptible)
set_bit(SCST_FLAG_SUSPENDING, &scst_flags);
set_bit(SCST_FLAG_SUSPENDED, &scst_flags);
/*
* Assignment of SCST_FLAG_SUSPENDING and SCST_FLAG_SUSPENDED must be
* ordered with scst_cmd_count. Otherwise lockless logic in
* scst_translate_lun() and scst_mgmt_translate_lun() won't work.
*/
smp_mb__after_set_bit();
/*
* See comment in scst_user.c::dev_user_task_mgmt_fn() for more
* information about scst_user behavior.
*
* ToDo: make the global suspending unneeded (Switch to per-device
* ToDo: make the global suspending unneeded (switch to per-device
* reference counting? That would mean to switch off from lockless
* implementation of scst_translate_lun().. )
*/
@@ -530,6 +535,7 @@ int scst_suspend_activity(bool interruptible)
goto out_clear;
clear_bit(SCST_FLAG_SUSPENDING, &scst_flags);
/* See comment about smp_mb() above */
smp_mb__after_clear_bit();
TRACE_MGMT_DBG("Waiting for %d active commands finally to complete",
@@ -551,6 +557,7 @@ out:
out_clear:
clear_bit(SCST_FLAG_SUSPENDING, &scst_flags);
/* See comment about smp_mb() above */
smp_mb__after_clear_bit();
goto out_up;
}
@@ -568,6 +575,10 @@ static void __scst_resume_activity(void)
goto out;
clear_bit(SCST_FLAG_SUSPENDED, &scst_flags);
/*
* The barrier is needed to make sure all woken up threads see the
* cleared flag. Not sure if it's really needed, but let's be safe.
*/
smp_mb__after_clear_bit();
list_for_each_entry(l, &scst_cmd_lists_list, lists_list_entry) {

View File

@@ -420,6 +420,7 @@ static inline void scst_dec_on_dev_cmd(struct scst_cmd *cmd)
scst_unblock_dev(dev);
atomic_dec(&dev->on_dev_count);
/* See comment in scst_block_dev() */
smp_mb__after_atomic_dec();
TRACE_DBG("New on_dev_count %d", atomic_read(&dev->on_dev_count));
@@ -438,6 +439,7 @@ static inline void __scst_get(int barrier)
TRACE_DBG("Incrementing scst_cmd_count(%d)",
atomic_read(&scst_cmd_count));
/* See comment about smp_mb() in scst_suspend_activity() */
if (barrier)
smp_mb__after_atomic_inc();
}
@@ -446,6 +448,7 @@ static inline void __scst_put(void)
{
int f;
f = atomic_dec_and_test(&scst_cmd_count);
/* See comment about smp_mb() in scst_suspend_activity() */
if (f && unlikely(test_bit(SCST_FLAG_SUSPENDED, &scst_flags))) {
TRACE_MGMT_DBG("%s", "Waking up scst_dev_cmd_waitQ");
wake_up_all(&scst_dev_cmd_waitQ);

View File

@@ -120,7 +120,8 @@ static int scst_init_cmd(struct scst_cmd *cmd, enum scst_exec_context *context)
}
/*
* Memory barrier isn't necessary here, because CPU appears to
* be self-consistent
* be self-consistent and we don't care about the race, described
* in comment in scst_do_job_init().
*/
rc = __scst_init_cmd(cmd);
@@ -885,6 +886,12 @@ static int scst_queue_retry_cmd(struct scst_cmd *cmd, int finished_cmds)
spin_lock_irqsave(&tgt->tgt_lock, flags);
tgt->retry_cmds++;
/*
* Memory barrier is needed here, because we need the exact ordering
* of the retry_cmds write and the finished_cmds read, so that we do
* not miss the case when a command finished while we were queuing it
* for retry after the finished_cmds check.
*/
smp_mb();
TRACE_RETRY("TGT QUEUE FULL: incrementing retry_cmds %d",
tgt->retry_cmds);
@@ -1684,7 +1691,6 @@ int scst_check_local_events(struct scst_cmd *cmd)
* was_reset.
*/
spin_lock_bh(&dev->dev_lock);
barrier(); /* to reread was_reset */
if (dev->scsi_dev->was_reset) {
TRACE(TRACE_MGMT, "was_reset is %d", 1);
scst_set_cmd_error(cmd,
@@ -1764,12 +1770,16 @@ void scst_inc_expected_sn(struct scst_tgt_dev *tgt_dev, atomic_t *slot)
inc:
/*
* No locks is needed, because only one thread at time can
* be here (serialized by sn). Also it is supposed that there
* could not be half-incremented halves.
* No protection of expected_sn is needed, because only one thread
* at a time can be here (serialized by sn). Also it is supposed that
* there could not be half-incremented halves.
*/
tgt_dev->expected_sn++;
smp_mb(); /* write must be before def_cmd_count read */
/*
* Write must be before def_cmd_count read to be in sync. with
* scst_post_exec_sn(). See comment in scst_send_for_exec().
*/
smp_mb();
TRACE_SN("Next expected_sn: %ld", tgt_dev->expected_sn);
out:
@@ -2168,6 +2178,17 @@ static int scst_send_for_exec(struct scst_cmd **active_cmd)
spin_lock_irq(&tgt_dev->sn_lock);
tgt_dev->def_cmd_count++;
/*
* Memory barrier is needed here to implement lockless fast
* path. We need the exact order of read and write between
* def_cmd_count and expected_sn. Otherwise, we can miss the case
* when expected_sn was changed to be equal to cmd->sn while
* we are queuing cmd to the deferred list after the expected_sn
* check below. It would lead to a forever stuck command. But with
* the barrier in such case __scst_check_deferred_commands()
* will be called and it will take sn_lock, so we will be
* synchronized.
*/
smp_mb();
expected_sn = tgt_dev->expected_sn;
@@ -3044,6 +3065,7 @@ static int scst_translate_lun(struct scst_cmd *cmd)
TRACE_ENTRY();
/* See comment about smp_mb() in scst_suspend_activity() */
__scst_get(1);
if (likely(!test_bit(SCST_FLAG_SUSPENDED, &scst_flags))) {
@@ -3209,11 +3231,11 @@ restart:
* it's inserting to it, but another command at the same time
* seeing init cmd list empty and goes directly, because it
* could affect only commands from the same initiator to the
* same tgt_dev, but init_cmd_done() doesn't guarantee the order
* in case of simultaneous such calls anyway.
* same tgt_dev, but scst_cmd_init_done*() doesn't guarantee
* the order in case of simultaneous such calls anyway.
*/
TRACE_MGMT_DBG("Deleting cmd %p from init cmd list", cmd);
smp_wmb();
smp_wmb(); /* enforce the required order */
list_del(&cmd->cmd_list_entry);
spin_unlock(&scst_init_lock);
@@ -3571,6 +3593,7 @@ static int scst_mgmt_translate_lun(struct scst_mgmt_cmd *mcmd)
TRACE_DBG("Finding tgt_dev for mgmt cmd %p (lun %lld)", mcmd,
(long long unsigned int)mcmd->lun);
/* See comment about smp_mb() in scst_suspend_activity() */
__scst_get(1);
if (unlikely(test_bit(SCST_FLAG_SUSPENDED, &scst_flags) &&
@@ -3817,10 +3840,8 @@ void scst_abort_cmd(struct scst_cmd *cmd, struct scst_mgmt_cmd *mcmd,
if (other_ini) {
/* Might be necessary if command aborted several times */
if (!test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags)) {
if (!test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags))
set_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags);
smp_mb__after_set_bit();
}
} else {
/* Might be necessary if command aborted several times */
clear_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags);

85
srpt/README_in-tree Normal file
View File

@@ -0,0 +1,85 @@
SCSI RDMA Protocol (SRP) Target driver for Linux
=================================================
The SRP Target driver is designed to work directly on top of the
OpenFabrics OFED-1.x software stack (http://www.openfabrics.org) or
the Infiniband drivers in the Linux kernel tree
(http://www.kernel.org). The SRP target driver also interfaces with
the generic SCSI target mid-level driver called SCST
(http://scst.sourceforge.net).
How-to run
-----------
A. On srp target machine
1. Please refer to SCST's README for loading scst driver and its
dev_handlers drivers (scst_disk, scst_vdisk block or file IO mode, nullio, ...)
Example 1: working with real back-end scsi disks
a. modprobe scst
b. modprobe scst_disk
c. cat /proc/scsi_tgt/scsi_tgt
ibstor00:~ # cat /proc/scsi_tgt/scsi_tgt
Device (host:ch:id:lun or name) Device handler
0:0:0:0 dev_disk
4:0:0:0 dev_disk
5:0:0:0 dev_disk
6:0:0:0 dev_disk
7:0:0:0 dev_disk
Now you want to exclude the first scsi disk and expose the last 4 scsi disks as
IB/SRP luns for I/O
echo "add 4:0:0:0 0" >/proc/scsi_tgt/groups/Default/devices
echo "add 5:0:0:0 1" >/proc/scsi_tgt/groups/Default/devices
echo "add 6:0:0:0 2" >/proc/scsi_tgt/groups/Default/devices
echo "add 7:0:0:0 3" >/proc/scsi_tgt/groups/Default/devices
Example 2: working with VDISK FILEIO mode (using md0 device and file 10G-file)
a. modprobe scst
b. modprobe scst_vdisk
c. echo "open vdisk0 /dev/md0" > /proc/scsi_tgt/vdisk/vdisk
d. echo "open vdisk1 /10G-file" > /proc/scsi_tgt/vdisk/vdisk
e. echo "add vdisk0 0" >/proc/scsi_tgt/groups/Default/devices
f. echo "add vdisk1 1" >/proc/scsi_tgt/groups/Default/devices
Example 3: working with VDISK BLOCKIO mode (using md0 device, sda, and cciss/c1d0)
a. modprobe scst
b. modprobe scst_vdisk
c. echo "open vdisk0 /dev/md0 BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
d. echo "open vdisk1 /dev/sda BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
e. echo "open vdisk2 /dev/cciss/c1d0 BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
f. echo "add vdisk0 0" >/proc/scsi_tgt/groups/Default/devices
g. echo "add vdisk1 1" >/proc/scsi_tgt/groups/Default/devices
h. echo "add vdisk2 2" >/proc/scsi_tgt/groups/Default/devices
2. modprobe ib_srpt
B. On initiator machines you can manually do the following steps:
1. modprobe ib_srp
2. ibsrpdm -c (to discover new SRP target)
3. echo <new target info> > /sys/class/infiniband_srp/srp-mthca0-1/add_target
4. fdisk -l (will show new discovered scsi disks)
Example:
Assume that you use port 1 of the first HCA in the system, i.e. mthca0
[root@lab104 ~]# ibsrpdm -c -d /dev/infiniband/umad0
id_ext=0002c90200226cf4,ioc_guid=0002c90200226cf4,
dgid=fe800000000000000002c90200226cf5,pkey=ffff,service_id=0002c90200226cf4
[root@lab104 ~]# echo id_ext=0002c90200226cf4,ioc_guid=0002c90200226cf4,
dgid=fe800000000000000002c90200226cf5,pkey=ffff,service_id=0002c90200226cf4 >
/sys/class/infiniband_srp/srp-mthca0-1/add_target
OR
+ You can edit /etc/infiniband/openib.conf to load srp driver and srp HA daemon
automatically, i.e. set SRP_LOAD=yes and SRPHA_ENABLE=yes
+ To set up and use high availability feature you need dm-multipath driver
and multipath tool
+ Please refer to OFED-1.x SRP's user manual for more in-details instructions
on how-to enable/use HA feature
To minimize QUEUEFULL conditions, you can apply scst_increase_max_tgt_cmds
patch from SRPT package from http://sourceforge.net/project/showfiles.php?group_id=110471