From 0489f4287b9606f7cc4e65820f5978d398d9775d Mon Sep 17 00:00:00 2001 From: Vladislav Bolkhovitin Date: Fri, 11 Jan 2008 10:03:48 +0000 Subject: [PATCH] - Fixed possible crash on SN slots overflow - Docs updates - Other minor changes git-svn-id: http://svn.code.sf.net/p/scst/svn/trunk@243 d57e44dd-8a1f-0410-8b47-8ef2f437770f --- iscsi-scst/README | 17 ++++++---- qla2x00t/qla2x00-target/qla2x00t.c | 3 +- scst/README | 44 +++++++++++++++++++++++++ scst/ToDo | 5 +++ scst/include/scsi_tgt.h | 6 ++-- scst/src/scst_lib.c | 2 +- scst/src/scst_priv.h | 2 +- scst/src/scst_targ.c | 52 ++++++++++++++++++------------ 8 files changed, 97 insertions(+), 34 deletions(-) diff --git a/iscsi-scst/README b/iscsi-scst/README index fdd644353..fc6741587 100644 --- a/iscsi-scst/README +++ b/iscsi-scst/README @@ -32,6 +32,10 @@ Installation Basically as in README-IET, where file names are changed as specified above. +If you experience problems during kernel module load or running, check +your system and/or kernel logs (or run dmesg command for the few most +recent kernel messages). + To use full power of TCP zero-copy transmit functions, especially dealing with user space supplied via scst_user module memory, iSCSI-SCST needs to be notified when Linux networking finished data transmission. @@ -46,7 +50,7 @@ original IET behavior, when for data transmission: handlers) usage of SGV cache on transmit path (READ-type commands) will be disabled. The performance hit will be not big, but performance will still remain better, than for IET, because SGV cache will remain - used on receive path and IET doesn't have such feature. + used on receive path while IET doesn't have such feature. - For user space allocated memory (scst_user handler) all transmitted data will be additionally copied into temporary TCP buffers. The @@ -103,13 +107,12 @@ means name of the target, for example: "Default_iqn.2007-05.com.example:storage.disk1.sys1.xyz", and add there all necessary LUNs. Check SCST README file for details. +Check SCST README file how to tune for the best performance. + If under high load you experience I/O stalls or see in the kernel log -abort or reset messages then try to reduce QueuedCommands parameter in -iscsi-scstd.conf file for the corresponding target. Particularly, it is -known that the default value 32 sometimes too high if you do intensive -writes from VMware on a target disk, which use LVM in the snapshot mode. -In this case value like 16 or even 10 depending of your backstorage -speed could be more appropriate. +abort or reset messages, then try to reduce QueuedCommands parameter in +iscsi-scstd.conf file for the corresponding target. See also SCST README +file for more details about that issue. CAUTION: Working of target and initiator on the same host isn't ======== supported. See SCST README file for details. diff --git a/qla2x00t/qla2x00-target/qla2x00t.c b/qla2x00t/qla2x00-target/qla2x00t.c index 8f16029ec..f128292d5 100644 --- a/qla2x00t/qla2x00-target/qla2x00t.c +++ b/qla2x00t/qla2x00-target/qla2x00t.c @@ -1613,8 +1613,7 @@ static void q2t_task_mgmt_fn_done(struct scst_mgmt_cmd *scst_mcmd) TRACE_ENTRY(); - TRACE((scst_mcmd->fn == SCST_ABORT_TASK) ? TRACE_MGMT_MINOR : TRACE_MGMT, - "scst_mcmd (%p) status %#x state %#x", scst_mcmd, + TRACE_MGMT_DBG("scst_mcmd (%p) status %#x state %#x", scst_mcmd, scst_mcmd->status, scst_mcmd->state); mcmd = scst_mgmt_cmd_get_tgt_priv(scst_mcmd); diff --git a/scst/README b/scst/README index 838b237ca..aa4177aed 100644 --- a/scst/README +++ b/scst/README @@ -87,6 +87,9 @@ Then, to see your devices remotely, you need to add them to at least are seen remotely. There must be LUN 0 in each security group, i.e. LUs numeration must not start from, e.g., 1. +If you experience problems during modules load or running, check your +kernel logs (or run dmesg command for the few most recent messages). + IMPORTANT: Without loading appropriate device handler, corresponding devices ========= will be invisible for remote initiators, which could lead to holes in the LUN addressing, so automatic device scanning by remote SCSI @@ -713,6 +716,47 @@ IMPORTANT: If you use on initiator some versions of Windows (at least W2K) See also important notes about setting block sizes >512 bytes for VDISK FILEIO devices above. +What if target's backstorage is too slow +---------------------------------------- + +If under high load you experience I/O stalls or see in the kernel log on +the target abort or reset messages, then your backstorage is too slow +for your target link speed and amount of simultaneously queued commands. +Simply processing of one or more commands takes too long, so initiator +decides that they are stuck on the target and tries to recover. +Particularly, it is known that the default amount of simultaneously +queued commands (48) is sometimes too high if you do intensive writes +from VMware on a target disk, which uses LVM in the snapshot mode. In +this case value like 16 or even 10 depending of your backstorage speed +could be more appropriate. + +Unfortunately, currently SCST lacks dynamic I/O flow control, when the +queue depth on the target is dynamically decreased/increased based on +how slow/fast the backstorage speed comparing to the target link. So, +there are only 4 possible actions, which you can do to workaround or fix +this issue: + +1. Ignore incoming task management (TM) commands. It's fine if there are +not too many of them, i.e. if the backstorage isn't too slow. + +2. Decrease /sys/block/sdX/device/queue_depth on the initiator in case +if it's Linux (see below how) and/or SCST_MAX_TGT_DEV_COMMANDS constant +in scst_priv.h file until you stop seeing incoming TM commands. + +3. Insrease speed of the target's backstorage. + +4. Implement in SCST the dynamic I/O flow control. + +To decrease device queue depth on Linux initiators run command: + +# echo Y >/sys/block/sdX/device/queue_depth + +where Y is the new number of simultaneously queued commands, X - your +imported device letter, like 'a' for sda device. There are no special +limitations for Y value, it can be any value from 1 to possible maximum +(usually, 32), so start from dividing the current value on 2, i.e. set +16, if /sys/block/sdX/device/queue_depth contains 32. + Credits ------- diff --git a/scst/ToDo b/scst/ToDo index 8df903aba..3d1b28a28 100644 --- a/scst/ToDo +++ b/scst/ToDo @@ -8,6 +8,11 @@ To be done the page cache (in order to avoid data copy between it and internal buffers). Requires modifications of the kernel. + - Dynamic I/O flow control, when the device queue depth on the target + will be dynamically decreased/increased based on how slow/fast the + backstorage speed comparing to the target link for current IO + pattern. + - Fix in-kernel O_DIRECT mode. - Close integration with Linux initiator SCSI mil-level, including diff --git a/scst/include/scsi_tgt.h b/scst/include/scsi_tgt.h index 58e9f4ca4..c0bd8f27e 100644 --- a/scst/include/scsi_tgt.h +++ b/scst/include/scsi_tgt.h @@ -1427,11 +1427,11 @@ struct scst_tgt_dev * Set if the prev cmd was ORDERED. Size must allow unprotected * modifications */ - unsigned long prev_cmd_ordered; + unsigned long prev_cmd_ordered; - int num_free_sn_slots; + int num_free_sn_slots; /* if it's <0, then all slots are busy */ atomic_t *cur_sn_slot; - atomic_t sn_slots[10]; + atomic_t sn_slots[15]; /* Used for storage of dev handler private stuff */ void *dh_priv; diff --git a/scst/src/scst_lib.c b/scst/src/scst_lib.c index fb7847c4f..c6b2e86e5 100644 --- a/scst/src/scst_lib.c +++ b/scst/src/scst_lib.c @@ -415,7 +415,7 @@ static struct scst_tgt_dev *scst_alloc_add_tgt_dev(struct scst_session *sess, INIT_LIST_HEAD(&tgt_dev->deferred_cmd_list); INIT_LIST_HEAD(&tgt_dev->skipped_sn_list); tgt_dev->expected_sn = 1; - tgt_dev->num_free_sn_slots = ARRAY_SIZE(tgt_dev->sn_slots); + tgt_dev->num_free_sn_slots = ARRAY_SIZE(tgt_dev->sn_slots)-1; tgt_dev->cur_sn_slot = &tgt_dev->sn_slots[0]; for(i = 0; i < (int)ARRAY_SIZE(tgt_dev->sn_slots); i++) atomic_set(&tgt_dev->sn_slots[i], 0); diff --git a/scst/src/scst_priv.h b/scst/src/scst_priv.h index 1c2fc6dac..b678ca45e 100644 --- a/scst/src/scst_priv.h +++ b/scst/src/scst_priv.h @@ -103,7 +103,7 @@ extern unsigned long scst_trace_flag; ** Maximum count of uncompleted commands that an initiator could ** queue on any device. Then it will start getting TASK QUEUE FULL status. **/ -#define SCST_MAX_TGT_DEV_COMMANDS 32 +#define SCST_MAX_TGT_DEV_COMMANDS 48 /** ** Maximum count of uncompleted commands that could be queued on any device. diff --git a/scst/src/scst_targ.c b/scst/src/scst_targ.c index ab8ed86ae..00c01f288 100644 --- a/scst/src/scst_targ.c +++ b/scst/src/scst_targ.c @@ -1772,14 +1772,16 @@ void scst_inc_expected_sn(struct scst_tgt_dev *tgt_dev, atomic_t *slot) TRACE_SN("Slot is 0 (num_free_sn_slots=%d)", tgt_dev->num_free_sn_slots); - if (tgt_dev->num_free_sn_slots != ARRAY_SIZE(tgt_dev->sn_slots)) { + if (tgt_dev->num_free_sn_slots < (int)ARRAY_SIZE(tgt_dev->sn_slots)-1) { spin_lock_irq(&tgt_dev->sn_lock); - if (tgt_dev->num_free_sn_slots != ARRAY_SIZE(tgt_dev->sn_slots)) { + if (likely(tgt_dev->num_free_sn_slots < (int)ARRAY_SIZE(tgt_dev->sn_slots)-1)) { + if (tgt_dev->num_free_sn_slots < 0) + tgt_dev->cur_sn_slot = slot; + smp_mb(); /* to be in-sync with SIMPLE case in scst_cmd_set_sn() */ tgt_dev->num_free_sn_slots++; TRACE_SN("Incremented num_free_sn_slots (%d)", tgt_dev->num_free_sn_slots); - if (tgt_dev->num_free_sn_slots == 0) - tgt_dev->cur_sn_slot = slot; + } spin_unlock_irq(&tgt_dev->sn_lock); } @@ -2615,7 +2617,8 @@ static void scst_cmd_set_sn(struct scst_cmd *cmd) tgt_dev->prev_cmd_ordered = 0; } else { - TRACE(TRACE_MINOR, "%s", "Not enough SN slots"); + TRACE(TRACE_MINOR, "***WARNING*** Not enough SN slots " + "%d", ARRAY_SIZE(tgt_dev->sn_slots)); goto ordered; } break; @@ -2626,21 +2629,30 @@ static void scst_cmd_set_sn(struct scst_cmd *cmd) ordered: if (!tgt_dev->prev_cmd_ordered) { spin_lock_irqsave(&tgt_dev->sn_lock, flags); - tgt_dev->num_free_sn_slots--; - smp_mb(); - if ((tgt_dev->num_free_sn_slots >= 0) && - (atomic_read(tgt_dev->cur_sn_slot) > 0)) { - do { - tgt_dev->cur_sn_slot++; - if (tgt_dev->cur_sn_slot == - tgt_dev->sn_slots + - ARRAY_SIZE(tgt_dev->sn_slots)) - tgt_dev->cur_sn_slot = tgt_dev->sn_slots; - } while(atomic_read(tgt_dev->cur_sn_slot) != 0); - TRACE_SN("New cur SN slot %zd", - tgt_dev->cur_sn_slot-tgt_dev->sn_slots); - } else - tgt_dev->num_free_sn_slots++; + if (tgt_dev->num_free_sn_slots >= 0) { + tgt_dev->num_free_sn_slots--; + if (tgt_dev->num_free_sn_slots >= 0) { + int i = 0; + /* + * Commands can finish in any order, so we don't + * know, which slot is empty. + */ + while(1) { + tgt_dev->cur_sn_slot++; + if (tgt_dev->cur_sn_slot == tgt_dev->sn_slots + + ARRAY_SIZE(tgt_dev->sn_slots)) + tgt_dev->cur_sn_slot = tgt_dev->sn_slots; + + if (atomic_read(tgt_dev->cur_sn_slot) == 0) + break; + + i++; + sBUG_ON(i == ARRAY_SIZE(tgt_dev->sn_slots)); + } + TRACE_SN("New cur SN slot %zd", + tgt_dev->cur_sn_slot-tgt_dev->sn_slots); + } + } spin_unlock_irqrestore(&tgt_dev->sn_lock, flags); } tgt_dev->prev_cmd_ordered = 1;