From 41c3515516cf9d310cd77f045420d2a110faa6c3 Mon Sep 17 00:00:00 2001
From: Vladislav Bolkhovitin
Date: Wed, 10 Dec 2008 10:55:01 +0000
Subject: [PATCH] - Memory barriers cleanup; comments for them improved
- Small docs update
- srpt/README_in-tree added

git-svn-id: http://svn.code.sf.net/p/scst/svn/trunk@614 d57e44dd-8a1f-0410-8b47-8ef2f437770f
---
 iscsi-scst/kernel/iscsi.c          | 10 +++-
 iscsi-scst/kernel/iscsi.h          |  7 ++-
 scripts/generate-kernel-patch      |  2 +-
 scst/README                        |  2 +
 scst/README_in-tree                |  2 +
 scst/src/dev_handlers/scst_user.c  |  7 +++
 scst/src/dev_handlers/scst_vdisk.c |  1 -
 scst/src/scst_lib.c                | 22 +++++---
 scst/src/scst_main.c               | 13 ++++-
 scst/src/scst_priv.h               |  3 ++
 scst/src/scst_targ.c               | 45 +++++++++++-----
 srpt/README_in-tree                | 85 ++++++++++++++++++++++++++++++
 12 files changed, 174 insertions(+), 25 deletions(-)
 create mode 100644 srpt/README_in-tree

diff --git a/iscsi-scst/kernel/iscsi.c b/iscsi-scst/kernel/iscsi.c
index 6f6833ac1..64a38d920 100644
--- a/iscsi-scst/kernel/iscsi.c
+++ b/iscsi-scst/kernel/iscsi.c
@@ -2512,7 +2512,9 @@ static inline void iscsi_set_state_wake_up(struct iscsi_cmnd *req,
	 * We use wait_event() to wait for the state change, but it checks its
	 * condition without any protection, so without cmnd_get() it is
	 * possible that req will die "immediately" after the state assignment
-	 * and wake_up() will operate on dead data.
+	 * and wake_up() will operate on dead data. We use the ordered version
+	 * of cmnd_get(), because the "get" must be done before the state
+	 * assignment.
	 */
	cmnd_get_ordered(req);
	req->scst_state = new_state;
@@ -2710,6 +2712,12 @@ static int iscsi_xmit_response(struct scst_cmd *scst_cmd)
	sBUG();
 #endif

+	/*
+	 * "_ordered" here to protect from reordering, which could lead to
+	 * premature connection destruction in req_cmnd_release(). Just in
+	 * case, actually, because reordering shouldn't go that far, but who
+	 * knows..
+	 */
	conn_get_ordered(conn);
	req_cmnd_release(req);
	iscsi_try_local_processing(conn);
diff --git a/iscsi-scst/kernel/iscsi.h b/iscsi-scst/kernel/iscsi.h
index 52693c324..750db9cd8 100644
--- a/iscsi-scst/kernel/iscsi.h
+++ b/iscsi-scst/kernel/iscsi.h
@@ -475,6 +475,7 @@ static inline void cmnd_get(struct iscsi_cmnd *cmnd)
 static inline void cmnd_get_ordered(struct iscsi_cmnd *cmnd)
 {
	cmnd_get(cmnd);
+	/* See comments for each cmnd_get_ordered() use */
	smp_mb__after_atomic_inc();
 }

@@ -546,6 +547,7 @@ static inline void conn_get(struct iscsi_conn *conn)
 static inline void conn_get_ordered(struct iscsi_conn *conn)
 {
	conn_get(conn);
+	/* See comments for each conn_get_ordered() use */
	smp_mb__after_atomic_inc();
 }

@@ -556,8 +558,9 @@ static inline void conn_put(struct iscsi_conn *conn)
	sBUG_ON(atomic_read(&conn->conn_ref_cnt) == 0);

	/*
-	 * It always ordered to protect from undesired side effects like
-	 * accessing just destroyed object because of this *_dec() reordering.
+	 * Make it always ordered to protect from undesired side effects like
+	 * accessing a conn just destroyed by close_conn(), caused by
+	 * reordering of this atomic_dec().
	 */
	smp_mb__before_atomic_dec();
	atomic_dec(&conn->conn_ref_cnt);
diff --git a/scripts/generate-kernel-patch b/scripts/generate-kernel-patch
index 5837c690d..4c07cdd60 100755
--- a/scripts/generate-kernel-patch
+++ b/scripts/generate-kernel-patch
@@ -359,7 +359,7 @@ else
 fi \
 | process_patch "srpt.diff"

-add_file "srpt/README" "Documentation/scst/README.srpt" \
+add_file "srpt/README_in-tree" "Documentation/scst/README.srpt" \
 | process_patch "srpt-doc.diff"

diff --git a/scst/README b/scst/README
index 0c9e6a92e..f9a02e5ea 100644
--- a/scst/README
+++ b/scst/README
@@ -274,6 +274,8 @@ For changing VMSPLIT option (CONFIG_VMSPLIT to be precise) you should in

  - General setup->Configure standard kernel features (for small systems): ON

+ - General setup->Prompt for development and/or incomplete code/drivers: ON
+
  - Processor type and features->High Memory Support: OFF

  - Processor type and features->Memory split: according to amount of
diff --git a/scst/README_in-tree b/scst/README_in-tree
index fb09671bf..5fb3adf33 100644
--- a/scst/README_in-tree
+++ b/scst/README_in-tree
@@ -222,6 +222,8 @@ For changing VMSPLIT option (CONFIG_VMSPLIT to be precise) you should in

  - General setup->Configure standard kernel features (for small systems): ON

+ - General setup->Prompt for development and/or incomplete code/drivers: ON
+
  - Processor type and features->High Memory Support: OFF

  - Processor type and features->Memory split: according to amount of
diff --git a/scst/src/dev_handlers/scst_user.c b/scst/src/dev_handlers/scst_user.c
index 04bbd0070..df9b6bddf 100644
--- a/scst/src/dev_handlers/scst_user.c
+++ b/scst/src/dev_handlers/scst_user.c
@@ -1270,6 +1270,11 @@ static int dev_user_process_reply_exec(struct scst_user_cmd *ucmd,
		if (unlikely((cmd->data_direction == SCST_DATA_READ) ||
			     (cmd->resp_data_len != 0)))
			goto out_inval;
+		/*
+		 * The background_exec assignment must be done after the ucmd
+		 * "get". Otherwise, due to reordering, dev_user_process_reply()
+		 * could destroy ucmd before the reference is taken here.
+		 */
		ucmd_get_ordered(ucmd);
		ucmd->background_exec = 1;
		TRACE_DBG("Background ucmd %p", ucmd);
@@ -1383,6 +1388,8 @@ static int dev_user_process_reply(struct scst_user_dev *dev,
		goto out_unlock;
	}

+	/* To sync with dev_user_process_reply_exec(). See comment there. */
+	smp_mb();
	if (ucmd->background_exec) {
		state = UCMD_STATE_EXECING;
		goto unlock_process;
diff --git a/scst/src/dev_handlers/scst_vdisk.c b/scst/src/dev_handlers/scst_vdisk.c
index 47231a67c..8445a3087 100644
--- a/scst/src/dev_handlers/scst_vdisk.c
+++ b/scst/src/dev_handlers/scst_vdisk.c
@@ -2450,7 +2450,6 @@ static void blockio_exec_rw(struct scst_cmd *cmd, struct scst_vdisk_thr *thr,

	/* +1 to prevent erroneous too early command completion */
	atomic_set(&blockio_work->bios_inflight, bios+1);
-	smp_wmb();

	while (hbio) {
		bio = hbio;
diff --git a/scst/src/scst_lib.c b/scst/src/scst_lib.c
index 37a3a5306..9cf80fa9c 100644
--- a/scst/src/scst_lib.c
+++ b/scst/src/scst_lib.c
@@ -1528,9 +1528,10 @@ void scst_check_retries(struct scst_tgt *tgt)

	/*
	 * We don't worry about overflow of finished_cmds, because we check
-	 * only for its change
+	 * only for its change.
	 */
	atomic_inc(&tgt->finished_cmds);
+	/* See comment in scst_queue_retry_cmd() */
	smp_mb__after_atomic_inc();
	if (unlikely(tgt->retry_cmds > 0)) {
		struct scst_cmd *c, *tc;
@@ -1541,8 +1542,7 @@ void scst_check_retries(struct scst_tgt *tgt)

		spin_lock_irqsave(&tgt->tgt_lock, flags);
		list_for_each_entry_safe(c, tc, &tgt->retry_cmd_list,
-				cmd_list_entry)
-		{
+				cmd_list_entry) {
			tgt->retry_cmds--;

			TRACE_RETRY("Moving retry cmd %p to head of active "
@@ -3125,7 +3125,13 @@ static void scst_block_dev(struct scst_device *dev, int outstanding)
	__scst_block_dev(dev);
	spin_unlock_bh(&dev->dev_lock);

-	/* spin_unlock_bh() doesn't provide the necessary memory barrier */
+	/*
+	 * A memory barrier is necessary here, because in wait_event() below
+	 * we need to read on_dev_count after we have increased block_count.
+	 * Otherwise, we can miss the wake-up in scst_dec_on_dev_cmd().
+	 * We use the explicit barrier, because spin_unlock_bh() doesn't
+	 * provide the necessary memory barrier functionality.
+	 */
	smp_mb();

	TRACE_MGMT_DBG("Waiting during blocking outstanding %d (on_dev_count "
@@ -3211,7 +3217,6 @@ repeat:
		spin_lock_bh(&dev->dev_lock);
		if (unlikely(test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags)))
			goto out_unlock;
-		barrier(); /* to reread block_count */
		if (dev->block_count > 0) {
			scst_dec_on_dev_cmd(cmd);
			TRACE_MGMT_DBG("Delaying cmd %p due to blocking or "
@@ -3230,7 +3235,6 @@ repeat:
	}
	if (unlikely(dev->dev_serialized)) {
		spin_lock_bh(&dev->dev_lock);
-		barrier(); /* to reread block_count */
		if (dev->block_count == 0) {
			TRACE_MGMT_DBG("cmd %p (tag %llu), blocking further "
				"cmds due to serializing (dev %p)", cmd,
@@ -3405,7 +3409,6 @@ void scst_xmit_process_aborted_cmd(struct scst_cmd *cmd)

	scst_done_cmd_mgmt(cmd);

-	smp_rmb();
	if (test_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags)) {
		if (cmd->completed) {
			/* It's completed and it's OK to return its result */
@@ -3548,6 +3551,7 @@ static void tm_dbg_timer_fn(unsigned long arg)
 {
	TRACE_MGMT_DBG("%s", "delayed cmd timer expired");
	tm_dbg_flags.tm_dbg_release = 1;
+	/* Used to make sure that all woken up threads see the new value */
	smp_wmb();
	wake_up_all(tm_dbg_p_cmd_list_waitQ);
 }
@@ -3763,6 +3767,10 @@ void tm_dbg_task_mgmt(struct scst_device *dev, const char *fn, int force)
			tm_dbg_delayed_cmds_count);
		tm_dbg_change_state();
		tm_dbg_flags.tm_dbg_release = 1;
+		/*
+		 * Used to make sure that all woken up threads see the new
+		 * value.
+		 */
		smp_wmb();
		if (tm_dbg_p_cmd_list_waitQ != NULL)
			wake_up_all(tm_dbg_p_cmd_list_waitQ);
diff --git a/scst/src/scst_main.c b/scst/src/scst_main.c
index b671c0bd2..173fac173 100644
--- a/scst/src/scst_main.c
+++ b/scst/src/scst_main.c
@@ -501,13 +501,18 @@ int scst_suspend_activity(bool interruptible)

	set_bit(SCST_FLAG_SUSPENDING, &scst_flags);
	set_bit(SCST_FLAG_SUSPENDED, &scst_flags);
+	/*
+	 * Assignment of SCST_FLAG_SUSPENDING and SCST_FLAG_SUSPENDED must be
+	 * ordered with scst_cmd_count. Otherwise, the lockless logic in
+	 * scst_translate_lun() and scst_mgmt_translate_lun() won't work.
+	 */
	smp_mb__after_set_bit();

	/*
	 * See comment in scst_user.c::dev_user_task_mgmt_fn() for more
	 * information about scst_user behavior.
	 *
-	 * ToDo: make the global suspending unneeded (Switch to per-device
+	 * ToDo: make the global suspending unneeded (switch to per-device
	 * reference counting? That would mean to switch off from lockless
	 * implementation of scst_translate_lun()..
	 * )
	 */
@@ -530,6 +535,7 @@
		goto out_clear;

	clear_bit(SCST_FLAG_SUSPENDING, &scst_flags);
+	/* See comment about smp_mb() above */
	smp_mb__after_clear_bit();

	TRACE_MGMT_DBG("Waiting for %d active commands finally to complete",
@@ -551,6 +557,7 @@ out:

out_clear:
	clear_bit(SCST_FLAG_SUSPENDING, &scst_flags);
+	/* See comment about smp_mb() above */
	smp_mb__after_clear_bit();
	goto out_up;
 }
@@ -568,6 +575,10 @@ static void __scst_resume_activity(void)
		goto out;

	clear_bit(SCST_FLAG_SUSPENDED, &scst_flags);
+	/*
+	 * The barrier is needed to make sure all woken up threads see the
+	 * cleared flag. Not sure if it's really needed, but let's be safe.
+	 */
	smp_mb__after_clear_bit();

	list_for_each_entry(l, &scst_cmd_lists_list, lists_list_entry) {
diff --git a/scst/src/scst_priv.h b/scst/src/scst_priv.h
index 84b069892..b14dbf989 100644
--- a/scst/src/scst_priv.h
+++ b/scst/src/scst_priv.h
@@ -420,6 +420,7 @@ static inline void scst_dec_on_dev_cmd(struct scst_cmd *cmd)
		scst_unblock_dev(dev);

	atomic_dec(&dev->on_dev_count);
+	/* See comment in scst_block_dev() */
	smp_mb__after_atomic_dec();

	TRACE_DBG("New on_dev_count %d", atomic_read(&dev->on_dev_count));
@@ -438,6 +439,7 @@ static inline void __scst_get(int barrier)
	TRACE_DBG("Incrementing scst_cmd_count(%d)",
		atomic_read(&scst_cmd_count));

+	/* See comment about smp_mb() in scst_suspend_activity() */
	if (barrier)
		smp_mb__after_atomic_inc();
 }
@@ -446,6 +448,7 @@ static inline void __scst_put(void)
 {
	int f;
	f = atomic_dec_and_test(&scst_cmd_count);
+	/* See comment about smp_mb() in scst_suspend_activity() */
	if (f && unlikely(test_bit(SCST_FLAG_SUSPENDED, &scst_flags))) {
		TRACE_MGMT_DBG("%s", "Waking up scst_dev_cmd_waitQ");
		wake_up_all(&scst_dev_cmd_waitQ);
diff --git a/scst/src/scst_targ.c b/scst/src/scst_targ.c
index 43bb00282..4b470c241 100644
--- a/scst/src/scst_targ.c
+++ b/scst/src/scst_targ.c
@@ -120,7 +120,8 @@ static int scst_init_cmd(struct scst_cmd *cmd, enum scst_exec_context *context)
	}
	/*
	 * Memory barrier isn't necessary here, because CPU appears to
-	 * be self-consistent
+	 * be self-consistent and we don't care about the race described
+	 * in the comment in scst_do_job_init().
	 */
	rc = __scst_init_cmd(cmd);
@@ -885,6 +886,12 @@ static int scst_queue_retry_cmd(struct scst_cmd *cmd, int finished_cmds)
	spin_lock_irqsave(&tgt->tgt_lock, flags);
	tgt->retry_cmds++;
+	/*
+	 * A memory barrier is needed here to enforce the exact order of the
+	 * retry_cmds write and the finished_cmds read, so that we don't miss
+	 * the case when a command finished while we were queuing this one
+	 * for retry after the finished_cmds check.
+	 */
	smp_mb();
	TRACE_RETRY("TGT QUEUE FULL: incrementing retry_cmds %d",
	      tgt->retry_cmds);
@@ -1684,7 +1691,6 @@ int scst_check_local_events(struct scst_cmd *cmd)
		 * was_reset.
		 */
		spin_lock_bh(&dev->dev_lock);
-		barrier(); /* to reread was_reset */
		if (dev->scsi_dev->was_reset) {
			TRACE(TRACE_MGMT, "was_reset is %d", 1);
			scst_set_cmd_error(cmd,
@@ -1764,12 +1770,16 @@ void scst_inc_expected_sn(struct scst_tgt_dev *tgt_dev, atomic_t *slot)

inc:
	/*
-	 * No locks is needed, because only one thread at time can
-	 * be here (serialized by sn). Also it is supposed that there
-	 * could not be half-incremented halves.
+	 * No protection of expected_sn is needed, because only one thread
+	 * at a time can be here (serialized by sn). Also it is supposed
+	 * that there could not be half-incremented halves.
	 */
	tgt_dev->expected_sn++;
-	smp_mb(); /* write must be before def_cmd_count read */
+	/*
+	 * The write must be before the def_cmd_count read to be in sync with
+	 * scst_post_exec_sn(). See the comment in scst_send_for_exec().
+	 */
+	smp_mb();
	TRACE_SN("Next expected_sn: %ld", tgt_dev->expected_sn);

out:
@@ -2168,6 +2178,17 @@ static int scst_send_for_exec(struct scst_cmd **active_cmd)
		spin_lock_irq(&tgt_dev->sn_lock);

		tgt_dev->def_cmd_count++;
+		/*
+		 * Memory barrier is needed here to implement the lockless
+		 * fast path. We need the exact order of the def_cmd_count
+		 * write and the expected_sn read. Otherwise, we can miss
+		 * the case when expected_sn becomes equal to cmd->sn while
+		 * we are queuing cmd on the deferred list after the
+		 * expected_sn read below, which would leave the command
+		 * stuck forever. But with the barrier, in such a case,
+		 * __scst_check_deferred_commands() will be called and it
+		 * will take sn_lock, so we will be synchronized.
+		 */
		smp_mb();

		expected_sn = tgt_dev->expected_sn;
@@ -3044,6 +3065,7 @@ static int scst_translate_lun(struct scst_cmd *cmd)

	TRACE_ENTRY();

+	/* See comment about smp_mb() in scst_suspend_activity() */
	__scst_get(1);

	if (likely(!test_bit(SCST_FLAG_SUSPENDED, &scst_flags))) {
@@ -3209,11 +3231,11 @@ restart:
		 * it's inserting to it, but another command at the same time
		 * seeing init cmd list empty and goes directly, because it
		 * could affect only commands from the same initiator to the
-		 * same tgt_dev, but init_cmd_done() doesn't guarantee the order
-		 * in case of simultaneous such calls anyway.
+		 * same tgt_dev, but scst_cmd_init_done*() doesn't guarantee
+		 * the order in case of simultaneous such calls anyway.
		 */
		TRACE_MGMT_DBG("Deleting cmd %p from init cmd list", cmd);
-		smp_wmb();
+		smp_wmb(); /* enforce the required order */
		list_del(&cmd->cmd_list_entry);
		spin_unlock(&scst_init_lock);
@@ -3571,6 +3593,7 @@ static int scst_mgmt_translate_lun(struct scst_mgmt_cmd *mcmd)
	TRACE_DBG("Finding tgt_dev for mgmt cmd %p (lun %lld)", mcmd,
	      (long long unsigned int)mcmd->lun);

+	/* See comment about smp_mb() in scst_suspend_activity() */
	__scst_get(1);

	if (unlikely(test_bit(SCST_FLAG_SUSPENDED, &scst_flags) &&
@@ -3817,10 +3840,8 @@ void scst_abort_cmd(struct scst_cmd *cmd, struct scst_mgmt_cmd *mcmd,

	if (other_ini) {
		/* Might be necessary if command aborted several times */
-		if (!test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags)) {
+		if (!test_bit(SCST_CMD_ABORTED, &cmd->cmd_flags))
			set_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags);
-			smp_mb__after_set_bit();
-		}
	} else {
		/* Might be necessary if command aborted several times */
		clear_bit(SCST_CMD_ABORTED_OTHER, &cmd->cmd_flags);
diff --git a/srpt/README_in-tree b/srpt/README_in-tree
new file mode 100644
index 000000000..327ed0907
--- /dev/null
+++ b/srpt/README_in-tree
@@ -0,0 +1,85 @@
+SCSI RDMA Protocol (SRP) Target driver for Linux
+=================================================
+
+The SRP Target driver is designed to work directly on top of the
+OpenFabrics OFED-1.x software stack (http://www.openfabrics.org) or
+the Infiniband drivers in the Linux kernel tree
+(http://www.kernel.org). The SRP target driver also interfaces with
+the generic SCSI target mid-level driver called SCST
+(http://scst.sourceforge.net).
+
+How-to run
+-----------
+
+A. On the SRP target machine
+1. Please refer to SCST's README for loading the scst driver and its
+dev_handlers drivers (scst_disk, scst_vdisk block or file IO mode, nullio, ...)
+
+Example 1: working with real back-end scsi disks
+a. modprobe scst
+b. modprobe scst_disk
+c. cat /proc/scsi_tgt/scsi_tgt
+
+ibstor00:~ # cat /proc/scsi_tgt/scsi_tgt
+Device (host:ch:id:lun or name)                             Device handler
+0:0:0:0                                                     dev_disk
+4:0:0:0                                                     dev_disk
+5:0:0:0                                                     dev_disk
+6:0:0:0                                                     dev_disk
+7:0:0:0                                                     dev_disk
+
+Now you want to exclude the first scsi disk and expose the last 4 scsi disks
+as IB/SRP luns for I/O:
+echo "add 4:0:0:0 0" >/proc/scsi_tgt/groups/Default/devices
+echo "add 5:0:0:0 1" >/proc/scsi_tgt/groups/Default/devices
+echo "add 6:0:0:0 2" >/proc/scsi_tgt/groups/Default/devices
+echo "add 7:0:0:0 3" >/proc/scsi_tgt/groups/Default/devices
+
+Example 2: working with VDISK FILEIO mode (using md0 device and file 10G-file)
+a. modprobe scst
+b. modprobe scst_vdisk
+c. echo "open vdisk0 /dev/md0" > /proc/scsi_tgt/vdisk/vdisk
+d. echo "open vdisk1 /10G-file" > /proc/scsi_tgt/vdisk/vdisk
+e. echo "add vdisk0 0" >/proc/scsi_tgt/groups/Default/devices
+f. echo "add vdisk1 1" >/proc/scsi_tgt/groups/Default/devices
+
+Example 3: working with VDISK BLOCKIO mode (using md0 device, sda, and cciss/c1d0)
+a. modprobe scst
+b. modprobe scst_vdisk
+c. echo "open vdisk0 /dev/md0 BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
+d. echo "open vdisk1 /dev/sda BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
+e. echo "open vdisk2 /dev/cciss/c1d0 BLOCKIO" > /proc/scsi_tgt/vdisk/vdisk
+f. echo "add vdisk0 0" >/proc/scsi_tgt/groups/Default/devices
+g. echo "add vdisk1 1" >/proc/scsi_tgt/groups/Default/devices
+h. echo "add vdisk2 2" >/proc/scsi_tgt/groups/Default/devices
+
+2. modprobe ib_srpt
+
+
+B. On initiator machines you can manually do the following steps:
+1. modprobe ib_srp
+2. ibsrpdm -c (to discover the new SRP target)
+3. echo > /sys/class/infiniband_srp/srp-mthca0-1/add_target
+4. fdisk -l (will show the newly discovered scsi disks)
+
+Example:
+Assume that you use port 1 of the first HCA in the system, i.e. mthca0.
+
+[root@lab104 ~]# ibsrpdm -c -d /dev/infiniband/umad0
+id_ext=0002c90200226cf4,ioc_guid=0002c90200226cf4,
+dgid=fe800000000000000002c90200226cf5,pkey=ffff,service_id=0002c90200226cf4
+[root@lab104 ~]# echo id_ext=0002c90200226cf4,ioc_guid=0002c90200226cf4,
+dgid=fe800000000000000002c90200226cf5,pkey=ffff,service_id=0002c90200226cf4 >
+/sys/class/infiniband_srp/srp-mthca0-1/add_target
+
+OR
+
++ You can edit /etc/infiniband/openib.conf to load the srp driver and srp HA
+daemon automatically, i.e. set SRP_LOAD=yes and SRPHA_ENABLE=yes
++ To set up and use the high availability feature you need the dm-multipath
+driver and multipath tool
++ Please refer to the OFED-1.x SRP user manual for more detailed instructions
+on how to enable/use the HA feature
+
+To minimize QUEUE FULL conditions, you can apply the scst_increase_max_tgt_cmds
+patch from the SRPT package at
+http://sourceforge.net/project/showfiles.php?group_id=110471
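
Appendix: the "get the reference, then publish" pattern

For readers unfamiliar with the idiom behind cmnd_get_ordered(),
conn_get_ordered() and ucmd_get_ordered() above, here is a minimal,
self-contained userspace sketch. It is illustrative only, not SCST code:
the names are made up, and C11 seq_cst atomics stand in for the kernel's
atomic_inc() followed by smp_mb__after_atomic_inc().

/*
 * sketch_get_ordered.c -- illustrative only, not SCST code.
 * Compile: cc -std=c11 sketch_get_ordered.c
 */
#include <stdatomic.h>
#include <stdio.h>

struct obj {
	atomic_int ref_cnt;	/* plays the role of conn_ref_cnt etc. */
	atomic_int published;	/* plays the role of ucmd->background_exec */
};

/*
 * Take the reference BEFORE publishing the flag. The seq_cst ordering of
 * the two operations is what smp_mb__after_atomic_inc() provides in the
 * patch: once another thread sees published != 0, the reference increment
 * is visible too, so the object cannot be freed under us.
 */
static void get_then_publish(struct obj *o)
{
	atomic_fetch_add(&o->ref_cnt, 1);
	atomic_store(&o->published, 1);
}

static void put(struct obj *o)
{
	if (atomic_fetch_sub(&o->ref_cnt, 1) == 1)
		printf("last reference dropped, object may be freed\n");
}

int main(void)
{
	struct obj o = { .ref_cnt = 1, .published = 0 };

	get_then_publish(&o);
	if (atomic_load(&o.published))
		put(&o);	/* safe: the extra reference is visible */
	put(&o);
	return 0;
}

Without the ordering, a consumer could observe published != 0 while the
reference increment is still invisible, and drop what it believes is the
last reference, exactly the premature destruction the patch comments warn
about.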
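Appendix: the store-buffering case behind the expected_sn barriers

The smp_mb() calls kept in scst_send_for_exec() and scst_inc_expected_sn()
guard against the classic "store buffering" reordering: each side writes
its own variable and then reads the other's, and without a full barrier
both sides can read the stale value, leaving a deferred command stuck.
Below is a hedged C11 sketch of that situation (illustrative names, not
the SCST implementation; seq_cst RMW operations stand in for smp_mb()).

/*
 * sb_sketch.c -- store-buffering sketch, illustrative only (not SCST code).
 * Compile: cc -std=c11 -pthread sb_sketch.c
 */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_long def_cmd_count;	/* models tgt_dev->def_cmd_count */
static atomic_long expected_sn;		/* models tgt_dev->expected_sn */
static long seen_sn = -1, seen_def = -1;

/* Models scst_send_for_exec(): write def_cmd_count, then read expected_sn. */
static void *defer_side(void *arg)
{
	(void)arg;
	atomic_fetch_add(&def_cmd_count, 1);	/* seq_cst RMW = full barrier */
	seen_sn = atomic_load(&expected_sn);
	return NULL;
}

/* Models scst_inc_expected_sn(): write expected_sn, then read def_cmd_count. */
static void *inc_side(void *arg)
{
	(void)arg;
	atomic_fetch_add(&expected_sn, 1);	/* seq_cst RMW = full barrier */
	seen_def = atomic_load(&def_cmd_count);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, defer_side, NULL);
	pthread_create(&b, NULL, inc_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/*
	 * With a full barrier between each side's write and read, at least
	 * one side must observe the other's write; both reading 0 is
	 * impossible. Without the barriers both could read 0 -- the
	 * "forever stuck command" case the patch comment describes.
	 */
	printf("seen_sn=%ld seen_def=%ld%s\n", seen_sn, seen_def,
	       (seen_sn == 0 && seen_def == 0) ? " (lost wakeup!)" : "");
	return 0;
}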