ib_srpt: Merged from trunk (svn merge -r2072:2459 https://scst.svn.sourceforge.net/svnroot/scst/trunk/srpt).

git-svn-id: http://svn.code.sf.net/p/scst/svn/branches/2.0.0.x@2468 d57e44dd-8a1f-0410-8b47-8ef2f437770f
Author: Bart Van Assche
Date:   2010-10-25 18:29:44 +00:00
Parent: 5b3780951d
Commit: 8ebd2bf6bb
4 changed files with 769 additions and 934 deletions


@@ -62,27 +62,31 @@ Proceed as follows to compile and install the SRP target driver:
chkconfig scst on
The ib_srpt kernel module supports the following parameters:
-* srp_max_message_size (number)
+* srp_max_req_size (number)
Maximum size of an SRP control message in bytes. Examples of SRP control
messages are: login request, logout request, data transfer request, ...
The larger this parameter, the more scatter/gather list elements can be
sent at once. Use the following formula to compute an appropriate value
for this parameter: 68 + 16 * (sg_tablesize). The default value of
this parameter is 2116, which corresponds to an sg table size of 128.
* srp_max_rsp_size (number)
Maximum size of an SRP response message in bytes. Sense data is sent back
to the initiator via these messages. The default size is 256 bytes. With
this value there remain (256-36) = 220 bytes for sense data.
* srp_max_rdma_size (number)
Maximum number of bytes that may be transferred at once via RDMA. Defaults
to 65536 bytes, which is sufficient to use the full bandwidth of low-latency
HCAs. Increasing this value may decrease latency for applications
transferring large amounts of data at once.
* srpt_autodetect_cred_req (y or n, default n)
Whether or not to autodetect initiator support for SRP_CRED_REQ (initiators
with Linux kernel 2.6.37 or later only). The use of SRP_CRED_REQ allows
ib_srpt to process workloads with large I/O depths more efficiently.
Note: enabling this mode causes the Windows SRP initiator to stop working.
* srpt_srq_size (number, default 4095)
ib_srpt uses a shared receive queue (SRQ) for processing incoming SRP
requests. This number may have to be increased when a large number of
initiator systems are accessing a single SRP target system.
* srpt_sq_size (number, default 4096)
Per-channel InfiniBand send queue size. The default setting is sufficient
for a credit limit of 128. Changing this parameter to a smaller value may
cause RDMA requests to be retried and hence may slow down data transfer
severely.
* thread (0, 1 or 2, default 1)
Defines the context on which SRP requests are processed:
* thread=0: do as much processing in IRQ context as possible. Results in
@@ -180,14 +184,23 @@ For more information, see also:
* http://www.linux-ha.org/wiki/Main_Page
Notes about ib_srpt
-------------------
* Unloading the kernel module ib_srpt while I/O is ongoing is supported.
However, it can take up to two minutes before unloading finishes. During
that time CPU usage will be high.
Performance Notes - Initiator Side
----------------------------------
* For latency-sensitive applications, using the noop scheduler on the initiator
side can give significantly better results than other schedulers.
-* The following parameters have a small but measureable impact on SRP
+* The following parameters have a small but measurable impact on SRP
performance:
* /sys/class/block/${dev}/queue/rotational
* /sys/class/block/${dev}/queue/rq_affinity
* /proc/irq/${ib_int_no}/smp_affinity


@@ -1,4 +1,7 @@
-The following tests must be run at least before releasing a new SRPT version:
+ib_srpt Test Procedure
+======================
+At least the following tests must be run before releasing a new SRPT version:
* Make sure that SRPT compiles and installs without triggering any
compiler warning. Use the following command to compile and install SRPT:
@@ -18,36 +21,50 @@ The following tests must be run at least before releasing a new SRPT version:
* Run the following command on the target system:
${SCST_TRUNK}/scripts/monitor-memory-usage | tee memlog.txt
* Run the following command on the initiator system:
-for ((i=0;i<100000;i++)); do echo 'id_ext=0002c9030003cca2,ioc_guid=0002c9030003cca2,pkey=ffff,dgid=fe800000000000000002c9030003cca3,service_id=0002c9030003cca3' >/sys/class/infiniband_srp/srp-mlx4_0-1/add_target ; done
+target_id="$(/usr/sbin/ibsrpdm -c -d /dev/infiniband/umad0)"
+for ((i=0;i<100000;i++)); do echo "$target_id" >/sys/class/infiniband_srp/srp-mlx4_0-1/add_target; done
-* Verify that an I/O stress test runs fine by running the following command
-where $dev1 and $dev2 are device nodes created by the SRP initiator and go
-over different IB connections to the target:
+* Verify that the following I/O stress test does not report any errors even
+when left running for several hours:
-for dev in $dev1 $dev2
+dev=...
+umount /mnt
+mkfs.ext4 -O ^has_journal $dev
+while \
+mount $dev /mnt && \
+rm -rf /mnt/test* && \
+fio --verify=md5 -rw=randwrite --size=10m --bs=4k \
+--loops=1000000 --iodepth=64 --group_reporting --sync=1 --direct=1 \
+--norandommap --ioengine=aio --directory=/mnt --name=test --thread \
+--numjobs=80 --runtime=30 && \
+fsck -N $dev
do
-fio --bs=4K --buffered=0 --rw=write --ioengine=psync --verify=sha256 \
---verify_fatal=1 --loops=1000 \
---name=partition1 --filename=$dev &
+true
done
-* Verify that the following I/O stress test runs fine:
+* Repeat the above test with SCST_MAX_TGT_DEV_COMMANDS set to 48 and after
+having applied the patch below on ib_srpt.c. The expected result is that no
+data corruption occurs but that on the target error messages are logged
+about a negative req_lim value. The lowest expected value for req_lim is -15.
-mkfs.ext2 $dev
-mount $dev /mnt
-while true; do { sleep 30; killall fio; } & { rm -rf /mnt/test* && fio --verify=md5 -rw=randwrite --size=10m --bs=4k --loops=1000000 --iodepth=64 --group_reporting --sync=1 --direct=1 --norandommap --ioengine=psync --directory=/mnt --name=test --thread --numjobs=80; }; done
* Another I/O stress test:
(initiator) Write data pattern:
lmdd if=internal of=/dev/sdb opat=1 bs=1M count=1000
(target) Verify data:
lmdd if=/dev/exported-block of=internal ipat=1 bs=1M count=1000 mismatch=1
(initiator) Verify data:
-lmdd if=/dev/sdb of=internal ipat=1 bs=1M count=1000 mismatch=1
+for ((i=0;i<40;i++)); do lmdd if=/dev/sdb of=internal ipat=1 bs=1M count=1000 mismatch=1 & done
Index: srpt/src/ib_srpt.c
===================================================================
--- srpt/src/ib_srpt.c (revision 2412)
+++ srpt/src/ib_srpt.c (working copy)
@@ -2411,7 +2411,7 @@
ch->max_ti_iu_len = it_iu_len;
rsp->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT
| SRP_BUF_FORMAT_INDIRECT);
- rsp->req_lim_delta = cpu_to_be32(ch->rq_size);
+ rsp->req_lim_delta = cpu_to_be32(ch->rq_size + 16);
atomic_set(&ch->req_lim, ch->rq_size);
/* create cm reply */
* Verify that a SCSI reset works properly by running the following command
-on an initiator system:
+on an initiator system (note: even with the latest Linux SRP initiator, the
+command below triggers a kernel bug -- see also
+https://bugzilla.kernel.org/show_bug.cgi?id=13893):
sg_reset -d ${initiator_device}
@@ -57,7 +74,7 @@ The following tests must be run at least before releasing a new SRPT version:
and the following commands on an initiator system:
-target_id="id_ext=0002c9030003cca2,ioc_guid=0002c9030003cca2,dgid=fe800000000000000002c9030003cca3,pkey=ffff,service_id=0009030003cca2,ioc_guid=0002c9030003cca2,dgid=fe800000000000000002c9030003cca3,pkey=ffff,service_id=0002c9030003cca2"
+target_id="$(/usr/sbin/ibsrpdm -c -d /dev/infiniband/umad0)"
while true; do date; rmmod ib_srp; modprobe ib_srp; echo "${target_id}" > /sys/class/infiniband_srp/srp-mlx4_0-1/add_target; sleep 2; done
and verify that nothing unexpected happens.
@@ -65,7 +82,7 @@ The following tests must be run at least before releasing a new SRPT version:
* Log in twice from an initiator system, and verify that the first session
receives a DREQ upon the second login:
-target_id="id_ext=0002c9030003cca2,ioc_guid=0002c9030003cca2,dgid=fe800000000000000002c9030003cca3,pkey=ffff,service_id=0009030003cca2,ioc_guid=0002c9030003cca2,dgid=fe800000000000000002c9030003cca3,pkey=ffff,service_id=0002c9030003cca2"
+target_id="$(/usr/sbin/ibsrpdm -c -d /dev/infiniband/umad0)"
dmesg -c
echo "${target_id}" > /sys/class/infiniband_srp/srp-mlx4_0-1/add_target
dmesg -c
@@ -85,11 +102,24 @@ The following tests must be run at least before releasing a new SRPT version:
* Test with multiple values of ib_srp_tablesize in the range 1..128.
-* Verify that the initiator does not lock up while running the command:
+* Verify that the initiator does not lock up while running the command below
+(see also https://bugzilla.kernel.org/show_bug.cgi?id=14235):
-fio --bs=512 --buffered=0 --ioengine=sg --rw=read --invalidate=1 --thread --numjobs=8 --loops=10 --gtod_reduce=1 --group_reporting --name=/dev/sdb --filename=/dev/sdb
+dev=...
+fio --bs=512 --buffered=0 --ioengine=libaio --rw=read --invalidate=1 \
+--thread --numjobs=8 --loops=10 --gtod_reduce=1 --group_reporting \
+--name=$dev --filename=$dev
when target processing is delayed by the following command:
echo 15 > /sys/module/ib_srpt/parameters/interrupt_processing_delay_in_us
* Test whether queue overflow recovery works correctly as follows:
- On the target, reload ib_srpt with srpt_sq_size set to 64.
-- On the initiator, run a direct I/O test with large block sizes, e.g.
-scripts/blockdev-perftest -f -d -j -m 12 -M 24 /dev/sdb
+- On the initiator, run the following two commands in parallel:
+scripts/blockdev-perftest -r -d -j -m 12 -M 24 /dev/sdb
+fio --verify=md5 -rw=randwrite --size=10m --bs=4k \
+--loops=1000000 --iodepth=64 --group_reporting --sync=1 --direct=1 \
+--norandommap --ioengine=aio --directory=/mnt --name=test --thread \
+--numjobs=80 --runtime=30
* Repeat the above tests for all three threading modes: thread=0, thread=1
and thread=2.

File diff suppressed because it is too large


@@ -74,7 +74,7 @@ enum {
SRP_RDMA_WRITE_FROM_IOC = 0x20,
/*
-* srp_login_cmd::req_flags bitmasks. See also table 9 in the SRP r16a
+* srp_login_cmd.req_flags bitmasks. See also table 9 in the SRP r16a
* document.
*/
SRP_MTCH_ACTION = 0x03, /* MULTI-CHANNEL ACTION */
@@ -83,14 +83,14 @@ enum {
SRP_AESOLNT = 0x40, /* asynchronous event solicited notification */
/*
-* srp_cmd::sol_nt / srp_tsk_mgmt::sol_not bitmasks. See also tables
+* srp_cmd.sol_nt / srp_tsk_mgmt.sol_not bitmasks. See also tables
* 18 and 20 in the T10 r16a document.
*/
SRP_SCSOLNT = 0x02, /* SCSOLNT = successful solicited notification */
SRP_UCSOLNT = 0x04, /* UCSOLNT = unsuccessful solicited notification */
/*
-* srp_rsp::sol_not / srp_t_logout::sol_not bitmasks. See also tables
+* srp_rsp.sol_not / srp_t_logout.sol_not bitmasks. See also tables
* 16 and 22 in the T10 r16a document.
*/
SRP_SOLNT = 0x01, /* SOLNT = solicited notification */
@@ -120,45 +120,24 @@ enum {
DEFAULT_SRPT_SRQ_SIZE = 4095,
MAX_SRPT_SRQ_SIZE = 65535,
-MIN_MAX_MESSAGE_SIZE = 996,
-DEFAULT_MAX_MESSAGE_SIZE
+MIN_MAX_REQ_SIZE = 996,
+DEFAULT_MAX_REQ_SIZE
= sizeof(struct srp_cmd)/*48*/
+ sizeof(struct srp_indirect_buf)/*20*/
+ 128 * sizeof(struct srp_direct_buf)/*16*/,
MIN_MAX_RSP_SIZE = sizeof(struct srp_rsp)/*36*/ + 4,
DEFAULT_MAX_RSP_SIZE = 256, /* leaves 220 bytes for sense data */
DEFAULT_MAX_RDMA_SIZE = 65536,
-/*
-* Number of I/O contexts to be allocated for sending back requests
-* from the target to the initiator. Must be a power of two.
-*/
-TTI_IOCTX_COUNT = 2,
-TTI_IOCTX_MASK = TTI_IOCTX_COUNT - 1,
};
-/**
-* @SRPT_OP_TTI: wr_id flag for marking requests sent by the target to the
-* initiator.
-* @SRPT_OP_RECV: wr_id flag for marking receive operations.
-*/
-enum {
-SRPT_OP_TTI = (1 << 30),
-SRPT_OP_RECV = (1 << 31),
-SRPT_OP_FLAGS = SRPT_OP_TTI | SRPT_OP_RECV,
-};
/*
* SRP_CRED_REQ information unit, as defined in section 6.10 of the T10 SRP
* r16a document.
*/
struct srp_cred_req {
u8 opcode;
u8 sol_not;
u8 reserved[2];
__be32 req_lim_delta;
__be64 tag;
};
+static inline u64 encode_wr_id(u8 opcode, u32 idx)
+{ return ((u64)opcode << 32) | idx; }
+static inline u8 opcode_from_wr_id(u64 wr_id)
+{ return wr_id >> 32; }
+static inline u32 idx_from_wr_id(u64 wr_id)
+{ return (u32)wr_id; }
struct rdma_iu {
u64 raddr;
@@ -169,7 +148,7 @@ struct rdma_iu {
};
/**
-* enum srpt_command_state - SCSI command states managed by SRPT.
+* enum srpt_command_state - SCSI command state managed by SRPT.
* @SRPT_STATE_NEW: New command arrived and is being processed.
* @SRPT_STATE_NEED_DATA: Processing a write or bidir command and waiting
* for data arrival.
@@ -191,45 +170,60 @@ enum srpt_command_state {
};
/**
-* struct srpt_ioctx - SRPT-private data associated with a struct scst_cmd.
-* @index: Index of the I/O context in ioctx_ring.
-* @buf: Pointer to the message transferred via this I/O context.
-* @dma: DMA address of buf.
-* @wait_list: Node for insertion in srpt_rdma_ch::cmd_wait_list.
-* @state: I/O context state. See also enum srpt_command_state.
+* struct srpt_ioctx - Shared SRPT I/O context information.
+* @buf: Pointer to the buffer.
+* @dma: DMA address of the buffer.
+* @index: Index of the I/O context in its ioctx_ring array.
*/
struct srpt_ioctx {
-int index;
-void *buf;
-dma_addr_t dma;
-struct rdma_iu *rdma_ius;
-struct srp_direct_buf *rbufs;
-struct srp_direct_buf single_rbuf;
-struct list_head wait_list;
-int mapped_sg_count;
-u16 n_rdma_ius;
-u8 n_rdma;
-u8 n_rbuf;
-u64 wr_id;
-enum ib_wc_status status;
-enum ib_wc_opcode opcode;
-struct srpt_rdma_ch *ch;
-struct scst_cmd *scmnd;
-scst_data_direction dir;
-atomic_t state;
+void *buf;
+dma_addr_t dma;
+uint32_t index;
};
+/**
+* struct srpt_recv_ioctx - SRPT receive I/O context.
+* @ioctx: See above.
+* @wait_list: Node for insertion in srpt_rdma_ch.cmd_wait_list.
+*/
+struct srpt_recv_ioctx {
+struct srpt_ioctx ioctx;
+struct list_head wait_list;
+};
+/**
+* struct srpt_send_ioctx - SRPT send I/O context.
+* @ioctx: See above.
+* @free_list: Allows to make this struct an entry in srpt_rdma_ch.free_list.
+* @state: I/O context state. See also enum srpt_command_state.
+*/
+struct srpt_send_ioctx {
+struct srpt_ioctx ioctx;
+struct srpt_rdma_ch *ch;
+struct rdma_iu *rdma_ius;
+struct srp_direct_buf *rbufs;
+struct srp_direct_buf single_rbuf;
+struct scatterlist *sg;
+struct list_head free_list;
+int sg_cnt;
+int mapped_sg_count;
+u16 n_rdma_ius;
+u8 n_rdma;
+u8 n_rbuf;
+struct scst_cmd *scmnd;
+scst_data_direction dir;
+atomic_t state;
+};
/**
* struct srpt_mgmt_ioctx - SCST management command context information.
* @ioctx: SRPT I/O context associated with the management command.
* @ch: RDMA channel over which the management command has been received.
* @tag: SCSI tag of the management command.
*/
struct srpt_mgmt_ioctx {
-struct srpt_ioctx *ioctx;
-struct srpt_rdma_ch *ch;
-u64 tag;
+struct srpt_send_ioctx *ioctx;
+u64 tag;
};
/**
@@ -259,51 +253,41 @@ enum rdma_ch_state {
* @max_ti_iu_len: maximum target-to-initiator information unit length.
-* @supports_cred_req: whether or not the initiator supports SRP_CRED_REQ.
* @req_lim: request limit: maximum number of requests that may be sent
-by the initiator without having received a response or
-SRP_CRED_REQ.
-* @req_lim_delta: req_lim_delta to be sent in the next SRP_RSP.
-* @req_lim_waiter_count: number of threads waiting on req_lim_wait.
-* @req_lim_compl: completion variable that is signalled every time req_lim
-has been incremented.
+by the initiator without having received a response.
* @state: channel state. See also enum rdma_ch_state.
-* @list: node for insertion in the srpt_device::rch_list list.
+* @list: node for insertion in the srpt_device.rch_list list.
* @cmd_wait_list: list of SCST commands that arrived before the RTU event. This
list contains struct srpt_ioctx elements and is protected
against concurrent modification by the cm_id spinlock.
-* @tti_head: Index of first element of tti_ioctx that is not in use.
-* @tti_tail: Index of first element of tti_ioctx that is in use.
-* @tti_ioctx: Circular buffer with I/O contexts for sending requests from
-target to initiator.
+* @spinlock: Protects free_list.
+* @free_list: Head of list with free send I/O contexts.
* @scst_sess: SCST session information associated with this SRP channel.
* @sess_name: SCST session name.
*/
struct srpt_rdma_ch {
-wait_queue_head_t wait_queue;
-struct task_struct *thread;
-struct ib_cm_id *cm_id;
-struct ib_qp *qp;
-int rq_size;
-atomic_t processing_compl;
-struct ib_cq *cq;
-atomic_t sq_wr_avail;
-struct srpt_port *sport;
-u8 i_port_id[16];
-u8 t_port_id[16];
-int max_ti_iu_len;
-atomic_t supports_cred_req;
-atomic_t req_lim;
-atomic_t req_lim_delta;
-atomic_t req_lim_waiter_count;
-struct completion req_lim_compl;
-atomic_t state;
-struct list_head list;
-struct list_head cmd_wait_list;
-int tti_head;
-int tti_tail;
-struct srpt_ioctx *tti_ioctx[TTI_IOCTX_COUNT];
-struct scst_session *scst_sess;
-u8 sess_name[36];
+wait_queue_head_t wait_queue;
+struct task_struct *thread;
+struct ib_cm_id *cm_id;
+struct ib_qp *qp;
+int rq_size;
+atomic_t processing_compl;
+struct ib_cq *cq;
+atomic_t sq_wr_avail;
+struct srpt_port *sport;
+u8 i_port_id[16];
+u8 t_port_id[16];
+int max_ti_iu_len;
+atomic_t req_lim;
+spinlock_t spinlock;
+struct list_head free_list;
+struct srpt_send_ioctx **ioctx_ring;
+struct ib_wc wc[16];
+atomic_t state;
+struct list_head list;
+struct list_head cmd_wait_list;
+struct scst_session *scst_sess;
+u8 sess_name[36];
};
/**
@@ -334,9 +318,9 @@ struct srpt_port {
* @srq: Per-HCA SRQ (shared receive queue).
* @cm_id: connection identifier.
* @dev_attr: attributes of the InfiniBand device as obtained during the
-* ib_client::add() callback.
+* ib_client.add() callback.
* @ioctx_ring: Per-HCA I/O context ring.
-* @rch_list: per-device channel list -- see also srpt_rdma_ch::list.
+* @rch_list: per-device channel list -- see also srpt_rdma_ch.list.
* @spinlock: protects rch_list.
* @srpt_port: information about the ports owned by this HCA.
* @event_handler: per-HCA asynchronous IB event handler.
@@ -352,13 +336,13 @@ struct srpt_device {
struct ib_cm_id *cm_id;
struct ib_device_attr dev_attr;
int srq_size;
-struct srpt_ioctx **ioctx_ring;
+struct srpt_recv_ioctx **ioctx_ring;
struct list_head rch_list;
spinlock_t spinlock;
struct srpt_port port[2];
struct ib_event_handler event_handler;
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 26)
-struct class_device class_dev;
+struct class_device dev;
#else
struct device dev;
#endif