From 105fd01c57a7ecd549906a08c7e51b6dc8dbb503 Mon Sep 17 00:00:00 2001
From: Vladislav Bolkhovitin
Date: Thu, 29 Apr 2010 14:00:23 +0000
Subject: [PATCH] - Residuals handling fixes, part 1
 - Docs updated
 - Cleanups

git-svn-id: http://svn.code.sf.net/p/scst/svn/trunk@1666 d57e44dd-8a1f-0410-8b47-8ef2f437770f
---
 scst/README                            | 73 +++++++++++++++----------
 scst/README_in-tree                    | 73 +++++++++++++++----------
 scst/include/scst_const.h              |  2 -
 scst/src/dev_handlers/scst_changer.c   |  1 -
 scst/src/dev_handlers/scst_processor.c |  1 -
 scst/src/dev_handlers/scst_raid.c      |  1 -
 scst/src/dev_handlers/scst_user.c      |  2 +-
 scst/src/dev_handlers/scst_vdisk.c     | 27 +++-------
 scst/src/scst_lib.c                    |  7 +--
 usr/fileio/common.c                    | 40 +++++++++-----
 10 files changed, 127 insertions(+), 100 deletions(-)

diff --git a/scst/README b/scst/README
index fda05451c..b0439de4b 100644
--- a/scst/README
+++ b/scst/README
@@ -159,14 +159,6 @@ IMPORTANT: In the current version simultaneous access to local SCSI devices
            devices READ/WRITE commands using direct disk handler look to
            be safe.
 
-IMPORTANT: Some versions of Windows have a bug, which makes them consider
-========= response of READ CAPACITY(16) longer than 12 bytes as a faulty one.
-          As the result, such Windows'es refuse to see SCST exported
-          devices >2TB in size. This is fixed by MS in latter Windows
-          versions, probably, by some hotfix. But if you're using such
-          buggy Windows and experience this problem, change in
-          scst_vdisk.c::vdisk_exec_read_capacity16() "#if 1" to "#if 0".
-
 To uninstall, type 'make scst_uninstall'.
 
@@ -1293,17 +1285,37 @@ Caching
 -------
 
 By default for performance reasons VDISK FILEIO devices use write back
-caching policy. This is generally safe for modern applications who
-prepared to work in the write back caching environments, so know when to
-flush cache to keep their data consistent and minimize damage caused in
-case of power/hardware/software failures by lost in the cache data.
+caching policy.
+
+Generally, write back caching is reasonably safe to use and its danger
+is greatly overestimated, because:
+
+1. Modern HDDs have at least 16MB of cache working in write back mode
+by default, so a 10-drive RAID has 160MB of write back cache. Consider
+how many people are happy with that and how many have disabled the
+write back cache of their HDDs: almost all and almost nobody,
+correspondingly. Moreover, many HDDs lie about the state of their cache
+and report write through while actually working in write back mode, yet
+they are successfully used as well.
+
+Also, at the moment we don't know of any SSDs which are acceptably
+fast, i.e. faster than HDDs on average, with write back caching
+disabled. So, to get acceptable performance, their users have to use
+write back caching, hence on a power loss all data not yet committed to
+the flash chips, but already acknowledged as written, will be lost.
+
+2. Most, if not all, modern enterprise level applications are well
+prepared to work with write back cached storage. They know when to
+flush the cache and how to flush it to keep the data lost in a crash
+acceptable.
 For instance, journaled file
 systems flush cache on each meta data update, so they survive
 power/hardware/software failures pretty well.
-Note, Linux IO subsystem guarantees it work reliably only using data
-protection barriers, which, for instance, for Ext3 turned off by default
-(see http://lwn.net/Articles/283161). Some info about barriers from the
-XFS point of view could be found at
+
+Note that the Linux I/O subsystem guarantees this to work reliably only
+when data protection barriers are used, which are usually turned off by
+default (see http://lwn.net/Articles/283161). Some info about barriers
+from the XFS point of view can be found at
 http://oss.sgi.com/projects/xfs/faq.html#wcache. On Linux initiators for
 Ext3 and ReiserFS file systems the barrier protection could be turned on
 using "barrier=1" and "barrier=flush" mount options correspondingly. You
@@ -1316,19 +1328,21 @@ But even in case of journaled file systems your unsaved cached data will
 still be lost in case of power/hardware/software failures, so you may
 need to supply your target server with a good UPS with possibility to
 gracefully shutdown your target on power shortage or disable write back
-caching using WRITE_THROUGH flag. Note, on some real-life workloads
-write through caching might perform better, than write back one with the
-barrier protection turned on. Also note that without barriers enabled
+caching using the WRITE_THROUGH flag. Note that without barriers enabled
 (i.e. by default) Linux doesn't provide a guarantee that after
-sync()/fsync() all written data really hit permanent storage. They can
-be stored in the cache of your backstorage devices and, hence, lost on a
-power failure event. Thus, ever with write-through cache mode, you still
-either need to enable barriers on your backend file system on the target
-(for devices this is, indeed, impossible), or need a good UPS to protect
-yourself from not committed data loss.
+sync()/fsync() all written data have really hit permanent storage (see
+above). They can still be stored in the cache of your backstorage
+devices and, hence, lost on a power failure event. Thus, even with
+write through cache mode, you still either need to enable barriers on
+your backend file system on the target (for direct /dev/sdX devices
+this is, indeed, impossible), or need a good UPS to protect yourself
+from loss of not committed data.
 
-To limit this data loss you can use files in /proc/sys/vm to limit
-amount of unflushed data in the system cache.
+Note, on some real-life workloads write through caching might perform
+better than write back caching with the barrier protection turned on.
+
+To limit this data loss with write back caching you can use the files
+in /proc/sys/vm to limit the amount of unflushed data in the system cache.
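+
+For example (illustrative values only; on kernels without the *_bytes
+files use vm.dirty_background_ratio and vm.dirty_ratio instead):
+
+# start background writeback after 64MB of dirty data and block
+# writers after 256MB of dirty data
+echo 67108864 >/proc/sys/vm/dirty_background_bytes
+echo 268435456 >/proc/sys/vm/dirty_bytes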
 
 
 BLOCKIO VDISK mode
@@ -1359,8 +1373,9 @@ the system cache and the commands data buffers, so it saves a
 considerable amount of CPU power and memory bandwidth.
 
 IMPORTANT: Since data in BLOCKIO and FILEIO modes are not consistent between
-========= them, if you try to use a device in both those modes simultaneously,
-          you will almost instantly corrupt your data on that device.
+========= each other, if you try to use a device in both those modes
+          simultaneously, you will almost instantly corrupt your data
+          on that device.
 
 
 Pass-through mode
diff --git a/scst/README_in-tree b/scst/README_in-tree
index 7dc79c345..15ff849ff 100644
--- a/scst/README_in-tree
+++ b/scst/README_in-tree
@@ -86,14 +86,6 @@ IMPORTANT: In the current version simultaneous access to local SCSI devices
            devices READ/WRITE commands using direct disk handler look to
            be safe.
 
-IMPORTANT: Some versions of Windows have a bug, which makes them consider
-========= response of READ CAPACITY(16) longer than 12 bytes as a faulty one.
-          As the result, such Windows'es refuse to see SCST exported
-          devices >2TB in size. This is fixed by MS in latter Windows
-          versions, probably, by some hotfix. But if you're using such
-          buggy Windows and experience this problem, change in
-          scst_vdisk.c::vdisk_exec_read_capacity16() "#if 1" to "#if 0".
-
 Usage in failover mode
 ----------------------
 
@@ -879,17 +871,37 @@ Caching
 -------
 
 By default for performance reasons VDISK FILEIO devices use write back
-caching policy. This is generally safe for modern applications who
-prepared to work in the write back caching environments, so know when to
-flush cache to keep their data consistent and minimize damage caused in
-case of power/hardware/software failures by lost in the cache data.
+caching policy.
+
+Generally, write back caching is reasonably safe to use and its danger
+is greatly overestimated, because:
+
+1. Modern HDDs have at least 16MB of cache working in write back mode
+by default, so a 10-drive RAID has 160MB of write back cache. Consider
+how many people are happy with that and how many have disabled the
+write back cache of their HDDs: almost all and almost nobody,
+correspondingly. Moreover, many HDDs lie about the state of their cache
+and report write through while actually working in write back mode, yet
+they are successfully used as well.
+
+Also, at the moment we don't know of any SSDs which are acceptably
+fast, i.e. faster than HDDs on average, with write back caching
+disabled. So, to get acceptable performance, their users have to use
+write back caching, hence on a power loss all data not yet committed to
+the flash chips, but already acknowledged as written, will be lost.
+
+2. Most, if not all, modern enterprise level applications are well
+prepared to work with write back cached storage. They know when to
+flush the cache and how to flush it to keep the data lost in a crash
+acceptable.
 For instance, journaled file
 systems flush cache on each meta data update, so they survive
 power/hardware/software failures pretty well.
-Note, Linux IO subsystem guarantees it work reliably only using data
-protection barriers, which, for instance, for Ext3 turned off by default
-(see http://lwn.net/Articles/283161). Some info about barriers from the
-XFS point of view could be found at
+
+Note that the Linux I/O subsystem guarantees this to work reliably only
+when data protection barriers are used, which are usually turned off by
+default (see http://lwn.net/Articles/283161). Some info about barriers
+from the XFS point of view can be found at
 http://oss.sgi.com/projects/xfs/faq.html#wcache. On Linux initiators for
 Ext3 and ReiserFS file systems the barrier protection could be turned on
 using "barrier=1" and "barrier=flush" mount options correspondingly. You
@@ -902,19 +914,21 @@ But even in case of journaled file systems your unsaved cached data will
 still be lost in case of power/hardware/software failures, so you may
 need to supply your target server with a good UPS with possibility to
 gracefully shutdown your target on power shortage or disable write back
-caching using WRITE_THROUGH flag. Note, on some real-life workloads
-write through caching might perform better, than write back one with the
-barrier protection turned on. Also note that without barriers enabled
+caching using the WRITE_THROUGH flag. Note that without barriers enabled
 (i.e. by default) Linux doesn't provide a guarantee that after
-sync()/fsync() all written data really hit permanent storage. They can
-be stored in the cache of your backstorage devices and, hence, lost on a
-power failure event. Thus, ever with write-through cache mode, you still
-either need to enable barriers on your backend file system on the target
-(for devices this is, indeed, impossible), or need a good UPS to protect
-yourself from not committed data loss.
+sync()/fsync() all written data have really hit permanent storage (see
+above). They can still be stored in the cache of your backstorage
+devices and, hence, lost on a power failure event. Thus, even with
+write through cache mode, you still either need to enable barriers on
+your backend file system on the target (for direct /dev/sdX devices
+this is, indeed, impossible), or need a good UPS to protect yourself
+from loss of not committed data.
 
-To limit this data loss you can use files in /proc/sys/vm to limit
-amount of unflushed data in the system cache.
+Note, on some real-life workloads write through caching might perform
+better than write back caching with the barrier protection turned on.
+
+To limit this data loss with write back caching you can use the files
+in /proc/sys/vm to limit the amount of unflushed data in the system cache.
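+
+For example (illustrative values only; on kernels without the *_bytes
+files use vm.dirty_background_ratio and vm.dirty_ratio instead):
+
+# start background writeback after 64MB of dirty data and block
+# writers after 256MB of dirty data
+echo 67108864 >/proc/sys/vm/dirty_background_bytes
+echo 268435456 >/proc/sys/vm/dirty_bytes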
 
 
 BLOCKIO VDISK mode
@@ -945,8 +959,9 @@ the system cache and the commands data buffers, so it saves a
 considerable amount of CPU power and memory bandwidth.
 
 IMPORTANT: Since data in BLOCKIO and FILEIO modes are not consistent between
-========= them, if you try to use a device in both those modes simultaneously,
-          you will almost instantly corrupt your data on that device.
+========= each other, if you try to use a device in both those modes
+          simultaneously, you will almost instantly corrupt your data
+          on that device.
 
 
 Pass-through mode
diff --git a/scst/include/scst_const.h b/scst/include/scst_const.h
index f1799397d..bf73266aa 100644
--- a/scst/include/scst_const.h
+++ b/scst/include/scst_const.h
@@ -310,8 +310,6 @@ enum scst_cdb_flags {
 ** Misc SCSI constants
 *************************************************************/
 #define SCST_SENSE_ASC_UA_RESET	0x29
-#define READ_CAP_LEN		8
-#define READ_CAP16_LEN		32
 #define BYTCHK			0x02
 #define POSITION_LEN_SHORT	20
 #define POSITION_LEN_LONG	32
diff --git a/scst/src/dev_handlers/scst_changer.c b/scst/src/dev_handlers/scst_changer.c
index 1786fcb62..a30736f74 100644
--- a/scst/src/dev_handlers/scst_changer.c
+++ b/scst/src/dev_handlers/scst_changer.c
@@ -28,7 +28,6 @@
 
 #define CHANGER_NAME		"dev_changer"
 #define CHANGER_RETRIES		2
-#define READ_CAP_LEN		8
 
 static int changer_attach(struct scst_device *);
 /* static void changer_detach(struct scst_device *); */
diff --git a/scst/src/dev_handlers/scst_processor.c b/scst/src/dev_handlers/scst_processor.c
index 0a2668099..694aadd16 100644
--- a/scst/src/dev_handlers/scst_processor.c
+++ b/scst/src/dev_handlers/scst_processor.c
@@ -28,7 +28,6 @@
 
 #define PROCESSOR_NAME		"dev_processor"
 #define PROCESSOR_RETRIES	2
-#define READ_CAP_LEN		8
 
 static int processor_attach(struct scst_device *);
 /*static void processor_detach(struct scst_device *);*/
diff --git a/scst/src/dev_handlers/scst_raid.c b/scst/src/dev_handlers/scst_raid.c
index 91aaa5b11..06d383b83 100644
--- a/scst/src/dev_handlers/scst_raid.c
+++ b/scst/src/dev_handlers/scst_raid.c
@@ -28,7 +28,6 @@
 
 #define RAID_NAME		"dev_raid"
 #define RAID_RETRIES		2
-#define READ_CAP_LEN		8
 
 static int raid_attach(struct scst_device *);
 /* static void raid_detach(struct scst_device *); */
diff --git a/scst/src/dev_handlers/scst_user.c b/scst/src/dev_handlers/scst_user.c
index 3f36b723b..15da0349c 100644
--- a/scst/src/dev_handlers/scst_user.c
+++ b/scst/src/dev_handlers/scst_user.c
@@ -665,7 +665,7 @@ static int dev_user_alloc_space(struct scst_user_cmd *ucmd)
 	TRACE_ENTRY();
 
 	ucmd->state = UCMD_STATE_BUF_ALLOCING;
-	cmd->dh_data_buf_alloced = 1;
+	scst_cmd_set_dh_data_buff_alloced(cmd);
 
 	rc = dev_user_alloc_sg(ucmd, is_buff_cached(ucmd));
 	if (rc == 0)
diff --git a/scst/src/dev_handlers/scst_vdisk.c b/scst/src/dev_handlers/scst_vdisk.c
index 2e130be71..06991c13b 100644
--- a/scst/src/dev_handlers/scst_vdisk.c
+++ b/scst/src/dev_handlers/scst_vdisk.c
@@ -1933,7 +1933,7 @@ static void vdisk_exec_read_capacity(struct scst_cmd *cmd)
 	struct scst_vdisk_dev *virt_dev;
 	uint32_t blocksize;
 	uint64_t nblocks;
-	uint8_t buffer[READ_CAP_LEN];
+	uint8_t buffer[8];
 
 	TRACE_ENTRY();
 
@@ -1979,8 +1979,8 @@ static void vdisk_exec_read_capacity(struct scst_cmd *cmd)
 		goto out;
 	}
 
-	if (length > READ_CAP_LEN)
-		length = READ_CAP_LEN;
+	length = min_t(int, length, sizeof(buffer));
+
 	memcpy(address, buffer, length);
 
 	scst_put_buf(cmd, address);
@@ -2000,7 +2000,7 @@ static void vdisk_exec_read_capacity16(struct scst_cmd *cmd)
 	struct scst_vdisk_dev *virt_dev;
 	uint32_t blocksize;
 	uint64_t nblocks;
-	uint8_t buffer[READ_CAP16_LEN];
+	uint8_t buffer[32];
 
 	TRACE_ENTRY();
 
@@ -2063,23 +2063,8 @@ static void vdisk_exec_read_capacity16(struct scst_cmd *cmd)
 		goto out;
 	}
 
-	/*
-	 * Some versions of Windows have a bug, which makes them consider
-	 * response of READ CAPACITY(16) longer than 12 bytes as a faulty one.
-	 * As the result, such Windows'es refuse to see SCST exported
-	 * devices >2TB in size. This is fixed by MS in latter Windows
-	 * versions, probably, by some hotfix.
-	 *
-	 * But if you're using such buggy Windows and experience this problem,
-	 * change this '1' to '0'.
-	 */
-#if 0 /* there are too many such hosts */
-	if (length > READ_CAP16_LEN)
-		length = READ_CAP16_LEN;
-#else
-	if (length > 12)
-		length = 12;
-#endif
+	length = min_t(int, length, sizeof(buffer));
+
 	memcpy(address, buffer, length);
 
 	scst_put_buf(cmd, address);
diff --git a/scst/src/scst_lib.c b/scst/src/scst_lib.c
index 47ae0373e..ca887f745 100644
--- a/scst/src/scst_lib.c
+++ b/scst/src/scst_lib.c
@@ -27,9 +27,10 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
+#include 
+#include 
 
 #include "scst.h"
 #include "scst_priv.h"
@@ -4500,7 +4501,7 @@ static int get_trans_len_block_limit(struct scst_cmd *cmd, uint8_t off)
 
 static int get_trans_len_read_capacity(struct scst_cmd *cmd, uint8_t off)
 {
-	cmd->bufflen = READ_CAP_LEN;
+	cmd->bufflen = 8;
 	return 0;
 }
 
@@ -4512,7 +4513,7 @@ static int get_trans_len_serv_act_in(struct scst_cmd *cmd, uint8_t off)
 
 	if ((cmd->cdb[1] & 0x1f) == SAI_READ_CAPACITY_16) {
 		cmd->op_name = "READ CAPACITY(16)";
-		cmd->bufflen = READ_CAP16_LEN;
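+		/*
+		 * READ CAPACITY(16) carries its ALLOCATION LENGTH in CDB
+		 * bytes 10-13, so take the expected transfer length from
+		 * the CDB instead of assuming a fixed 32-byte response.
+		 */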
+		cmd->bufflen = be32_to_cpu(get_unaligned((__be32 *)&cmd->cdb[10]));
 		cmd->op_flags |= SCST_IMPLICIT_HQ|SCST_REG_RESERVE_ALLOWED;
 	} else
 		cmd->op_flags |= SCST_UNKNOWN_LENGTH;
diff --git a/usr/fileio/common.c b/usr/fileio/common.c
index bf3eb8fda..eb1c27279 100644
--- a/usr/fileio/common.c
+++ b/usr/fileio/common.c
@@ -314,6 +314,8 @@ static int do_exec(struct vdisk_cmd *vcmd)
 		}
 	}
 
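+	/*
+	 * Assume a full transfer by default; the exec_*() handlers below
+	 * shrink resp_data_len for shorter responses, which lets the
+	 * target report the correct residual.
+	 */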
+	reply->resp_data_len = cmd->bufflen;
+
 	switch (opcode) {
 	case READ_6:
 	case WRITE_6:
@@ -1059,8 +1061,11 @@ static void exec_inquiry(struct vdisk_cmd *vcmd)
 	sBUG_ON(resp_len >= (int)sizeof(buf));
 	if (length > resp_len)
 		length = resp_len;
+
 	memcpy(address, buf, length);
-	reply->resp_data_len = length;
+
+	if (length < reply->resp_data_len)
+		reply->resp_data_len = length;
 
 out:
 	TRACE_EXIT();
@@ -1080,8 +1085,11 @@ static void exec_request_sense(struct vdisk_cmd *vcmd)
 	l = set_sense(b, sizeof(b),
 		SCST_LOAD_SENSE(scst_sense_no_sense));
 	length = min(l, length);
+
 	memcpy(address, b, length);
-	reply->resp_data_len = length;
+
+	if (length < reply->resp_data_len)
+		reply->resp_data_len = length;
 
 	TRACE_EXIT();
 	return;
@@ -1318,8 +1326,11 @@ static void exec_mode_sense(struct vdisk_cmd *vcmd)
 	sBUG_ON(offset >= (int)sizeof(buf));
 	if (offset > length)
 		offset = length;
+
 	memcpy(address, buf, offset);
-	reply->resp_data_len = offset;
+
+	if (offset < reply->resp_data_len)
+		reply->resp_data_len = offset;
 
 out:
 	TRACE_EXIT();
@@ -1420,7 +1431,7 @@ static void exec_read_capacity(struct vdisk_cmd *vcmd)
 	uint8_t *address = (uint8_t*)(unsigned long)cmd->pbuf;
 	uint32_t blocksize;
 	uint64_t nblocks;
-	uint8_t buffer[READ_CAP_LEN];
+	uint8_t buffer[8];
 
 	TRACE_ENTRY();
 
@@ -1445,11 +1456,12 @@ static void exec_read_capacity(struct vdisk_cmd *vcmd)
 	buffer[6] = (blocksize >> (BYTE * 1)) & 0xFF;
 	buffer[7] = (blocksize >> (BYTE * 0)) & 0xFF;
 
-	if (length > READ_CAP_LEN)
-		length = READ_CAP_LEN;
+	length = min(length, (int)sizeof(buffer));
+
 	memcpy(address, buffer, length);
-	reply->resp_data_len = length;
+	if (length < reply->resp_data_len)
+		reply->resp_data_len = length;
 
 	TRACE_EXIT();
 	return;
@@ -1464,7 +1476,7 @@ static void exec_read_capacity16(struct vdisk_cmd *vcmd)
 	uint8_t *address = (uint8_t*)(unsigned long)cmd->pbuf;
 	uint32_t blocksize;
 	uint64_t nblocks;
-	uint8_t buffer[READ_CAP16_LEN];
+	uint8_t buffer[32];
 
 	TRACE_ENTRY();
 
@@ -1506,11 +1518,12 @@ static void exec_read_capacity16(struct vdisk_cmd *vcmd)
 		break;
 	}
 
-	if (length > READ_CAP16_LEN)
-		length = READ_CAP16_LEN;
+	length = min(length, (int)sizeof(buffer));
+
 	memcpy(address, buffer, length);
-	reply->resp_data_len = length;
+	if (length < reply->resp_data_len)
+		reply->resp_data_len = length;
 
 	TRACE_EXIT();
 	return;
@@ -1585,8 +1598,11 @@ static void exec_read_toc(struct vdisk_cmd *vcmd)
 
 	if (off > length)
 		off = length;
+
 	memcpy(address, buffer, off);
-	reply->resp_data_len = off;
+
+	if (off < reply->resp_data_len)
+		reply->resp_data_len = off;
 
 out:
 	TRACE_EXIT();