From 105fd01c57a7ecd549906a08c7e51b6dc8dbb503 Mon Sep 17 00:00:00 2001
From: Vladislav Bolkhovitin
Date: Thu, 29 Apr 2010 14:00:23 +0000
Subject: [PATCH] - Residuals handling fixes, part 1
 - Docs updated
 - Cleanups

git-svn-id: http://svn.code.sf.net/p/scst/svn/trunk@1666 d57e44dd-8a1f-0410-8b47-8ef2f437770f
---
 scst/README                            | 73 +++++++++++++++----------
 scst/README_in-tree                    | 73 +++++++++++++++----------
 scst/include/scst_const.h              |  2 -
 scst/src/dev_handlers/scst_changer.c   |  1 -
 scst/src/dev_handlers/scst_processor.c |  1 -
 scst/src/dev_handlers/scst_raid.c      |  1 -
 scst/src/dev_handlers/scst_user.c      |  2 +-
 scst/src/dev_handlers/scst_vdisk.c     | 27 +++-------
 scst/src/scst_lib.c                    |  7 +--
 usr/fileio/common.c                    | 40 +++++++++-----
 10 files changed, 127 insertions(+), 100 deletions(-)

diff --git a/scst/README b/scst/README
index fda05451c..b0439de4b 100644
--- a/scst/README
+++ b/scst/README
@@ -159,14 +159,6 @@ IMPORTANT: In the current version simultaneous access to local SCSI devices
            devices READ/WRITE commands using direct disk handler look to
            be safe.
 
-IMPORTANT: Some versions of Windows have a bug, which makes them consider
-========= response of READ CAPACITY(16) longer than 12 bytes as a faulty one.
-          As the result, such Windows'es refuse to see SCST exported
-          devices >2TB in size. This is fixed by MS in latter Windows
-          versions, probably, by some hotfix. But if you're using such
-          buggy Windows and experience this problem, change in
-          scst_vdisk.c::vdisk_exec_read_capacity16() "#if 1" to "#if 0".
-
 To uninstall, type 'make scst_uninstall'.
 
@@ -1293,17 +1285,37 @@ Caching
 -------
 
 By default for performance reasons VDISK FILEIO devices use write back
-caching policy. This is generally safe for modern applications who
-prepared to work in the write back caching environments, so know when to
-flush cache to keep their data consistent and minimize damage caused in
-case of power/hardware/software failures by lost in the cache data.
+caching policy.
+
+Generally, write back caching is reasonably safe to use and its danger
+is greatly overestimated, because:
+
+1. Modern HDDs have at least 16MB of cache working in write back mode
+by default, so a 10-drive RAID has 160MB of write back cache. Consider
+how many people are happy with that and how many have disabled the
+write back cache of their HDDs: almost all and almost nobody,
+correspondingly. Moreover, many HDDs lie about the state of their cache
+and report write through while actually working in write back mode, yet
+they are successfully used as well.
+
+Also, at the moment we don't know of any SSDs which are acceptably
+fast, i.e. faster than HDDs on average, with write back caching
+disabled. So, to get acceptable performance, their users have to use
+write back caching, hence on a power loss all data not yet committed to
+the flash chips, but already acknowledged as written, will be lost.
+
+2. Most, if not all, modern enterprise level applications are well
+prepared to work with write back cached storage. They know when to
+flush the cache and how to flush it to keep the data lost in a crash
+acceptable.
 For instance, journaled file
 systems flush cache on each meta data update, so they survive
 power/hardware/software failures pretty well.
-Note, Linux IO subsystem guarantees it work reliably only using data
-protection barriers, which, for instance, for Ext3 turned off by default
-(see http://lwn.net/Articles/283161). Some info about barriers from the
-XFS point of view could be found at
+
+Note that the Linux I/O subsystem guarantees this to work reliably only
+when data protection barriers are used, which are usually turned off by
+default (see http://lwn.net/Articles/283161). Some info about barriers
+from the XFS point of view can be found at
 http://oss.sgi.com/projects/xfs/faq.html#wcache. On Linux initiators for
 Ext3 and ReiserFS file systems the barrier protection could be turned on
 using "barrier=1" and "barrier=flush" mount options correspondingly. You
@@ -1316,19 +1328,21 @@ But even in case of journaled file systems your unsaved cached data will
 still be lost in case of power/hardware/software failures, so you may
 need to supply your target server with a good UPS with possibility to
 gracefully shutdown your target on power shortage or disable write back
-caching using WRITE_THROUGH flag. Note, on some real-life workloads
-write through caching might perform better, than write back one with the
-barrier protection turned on. Also note that without barriers enabled
+caching using the WRITE_THROUGH flag. Note that without barriers enabled
 (i.e. by default) Linux doesn't provide a guarantee that after
-sync()/fsync() all written data really hit permanent storage. They can
-be stored in the cache of your backstorage devices and, hence, lost on a
-power failure event. Thus, ever with write-through cache mode, you still
-either need to enable barriers on your backend file system on the target
-(for devices this is, indeed, impossible), or need a good UPS to protect
-yourself from not committed data loss.
+sync()/fsync() all written data have really hit permanent storage (see
+above). They can still be stored in the cache of your backstorage
+devices and, hence, lost on a power failure event. Thus, even with
+write through cache mode, you still either need to enable barriers on
+your backend file system on the target (for direct /dev/sdX devices
+this is, indeed, impossible), or need a good UPS to protect yourself
+from loss of not committed data.
 
-To limit this data loss you can use files in /proc/sys/vm to limit
-amount of unflushed data in the system cache.
+Note, on some real-life workloads write through caching might perform
+better than write back caching with the barrier protection turned on.
+
+To limit this data loss with write back caching you can use the files
+in /proc/sys/vm to limit the amount of unflushed data in the system cache.
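+
+For example (illustrative values only; on kernels without the *_bytes
+files use vm.dirty_background_ratio and vm.dirty_ratio instead):
+
+# start background writeback after 64MB of dirty data and block
+# writers after 256MB of dirty data
+echo 67108864 >/proc/sys/vm/dirty_background_bytes
+echo 268435456 >/proc/sys/vm/dirty_bytes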
 
 
 BLOCKIO VDISK mode
@@ -1359,8 +1373,9 @@ the system cache and the commands data buffers, so it saves a
 considerable amount of CPU power and memory bandwidth.
 
 IMPORTANT: Since data in BLOCKIO and FILEIO modes are not consistent between
-========= them, if you try to use a device in both those modes simultaneously,
-          you will almost instantly corrupt your data on that device.
+========= each other, if you try to use a device in both those modes
+          simultaneously, you will almost instantly corrupt your data
+          on that device.
 
 
 Pass-through mode
diff --git a/scst/README_in-tree b/scst/README_in-tree
index 7dc79c345..15ff849ff 100644
--- a/scst/README_in-tree
+++ b/scst/README_in-tree
@@ -86,14 +86,6 @@ IMPORTANT: In the current version simultaneous access to local SCSI devices
            devices READ/WRITE commands using direct disk handler look to
            be safe.
 
-IMPORTANT: Some versions of Windows have a bug, which makes them consider
-========= response of READ CAPACITY(16) longer than 12 bytes as a faulty one.
-          As the result, such Windows'es refuse to see SCST exported
-          devices >2TB in size. This is fixed by MS in latter Windows
-          versions, probably, by some hotfix. But if you're using such
-          buggy Windows and experience this problem, change in
-          scst_vdisk.c::vdisk_exec_read_capacity16() "#if 1" to "#if 0".
-
 Usage in failover mode
 ----------------------
 
@@ -879,17 +871,37 @@ Caching
 -------
 
 By default for performance reasons VDISK FILEIO devices use write back
-caching policy. This is generally safe for modern applications who
-prepared to work in the write back caching environments, so know when to
-flush cache to keep their data consistent and minimize damage caused in
-case of power/hardware/software failures by lost in the cache data.
+caching policy.
+
+Generally, write back caching is reasonably safe to use and its danger
+is greatly overestimated, because:
+
+1. Modern HDDs have at least 16MB of cache working in write back mode
+by default, so a 10-drive RAID has 160MB of write back cache. Consider
+how many people are happy with that and how many have disabled the
+write back cache of their HDDs: almost all and almost nobody,
+correspondingly. Moreover, many HDDs lie about the state of their cache
+and report write through while actually working in write back mode, yet
+they are successfully used as well.
+
+Also, at the moment we don't know of any SSDs which are acceptably
+fast, i.e. faster than HDDs on average, with write back caching
+disabled. So, to get acceptable performance, their users have to use
+write back caching, hence on a power loss all data not yet committed to
+the flash chips, but already acknowledged as written, will be lost.
+
+2. Most, if not all, modern enterprise level applications are well
+prepared to work with write back cached storage. They know when to
+flush the cache and how to flush it to keep the data lost in a crash
+acceptable.
 For instance, journaled file
 systems flush cache on each meta data update, so they survive
 power/hardware/software failures pretty well.
-Note, Linux IO subsystem guarantees it work reliably only using data
-protection barriers, which, for instance, for Ext3 turned off by default
-(see http://lwn.net/Articles/283161). Some info about barriers from the
-XFS point of view could be found at
+
+Note that the Linux I/O subsystem guarantees this to work reliably only
+when data protection barriers are used, which are usually turned off by
+default (see http://lwn.net/Articles/283161). Some info about barriers
+from the XFS point of view can be found at
 http://oss.sgi.com/projects/xfs/faq.html#wcache. On Linux initiators for
 Ext3 and ReiserFS file systems the barrier protection could be turned on
 using "barrier=1" and "barrier=flush" mount options correspondingly. You
@@ -902,19 +914,21 @@ But even in case of journaled file systems your unsaved cached data will
 still be lost in case of power/hardware/software failures, so you may
 need to supply your target server with a good UPS with possibility to
 gracefully shutdown your target on power shortage or disable write back
-caching using WRITE_THROUGH flag. Note, on some real-life workloads
-write through caching might perform better, than write back one with the
-barrier protection turned on. Also note that without barriers enabled
+caching using the WRITE_THROUGH flag. Note that without barriers enabled
 (i.e. by default) Linux doesn't provide a guarantee that after
-sync()/fsync() all written data really hit permanent storage. They can
-be stored in the cache of your backstorage devices and, hence, lost on a
-power failure event. Thus, ever with write-through cache mode, you still
-either need to enable barriers on your backend file system on the target
-(for devices this is, indeed, impossible), or need a good UPS to protect
-yourself from not committed data loss.
+sync()/fsync() all written data have really hit permanent storage (see
+above). They can still be stored in the cache of your backstorage
+devices and, hence, lost on a power failure event. Thus, even with
+write through cache mode, you still either need to enable barriers on
+your backend file system on the target (for direct /dev/sdX devices
+this is, indeed, impossible), or need a good UPS to protect yourself
+from loss of not committed data.
 
-To limit this data loss you can use files in /proc/sys/vm to limit
-amount of unflushed data in the system cache.
+Note, on some real-life workloads write through caching might perform
+better than write back caching with the barrier protection turned on.
+
+To limit this data loss with write back caching you can use the files
+in /proc/sys/vm to limit the amount of unflushed data in the system cache.
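+
+For example (illustrative values only; on kernels without the *_bytes
+files use vm.dirty_background_ratio and vm.dirty_ratio instead):
+
+# start background writeback after 64MB of dirty data and block
+# writers after 256MB of dirty data
+echo 67108864 >/proc/sys/vm/dirty_background_bytes
+echo 268435456 >/proc/sys/vm/dirty_bytes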
 
 
 BLOCKIO VDISK mode
@@ -945,8 +959,9 @@ the system cache and the commands data buffers, so it saves a
 considerable amount of CPU power and memory bandwidth.
 
 IMPORTANT: Since data in BLOCKIO and FILEIO modes are not consistent between
-========= them, if you try to use a device in both those modes simultaneously,
-          you will almost instantly corrupt your data on that device.
+========= each other, if you try to use a device in both those modes
+          simultaneously, you will almost instantly corrupt your data
+          on that device.
 
 
 Pass-through mode
diff --git a/scst/include/scst_const.h b/scst/include/scst_const.h
index f1799397d..bf73266aa 100644
--- a/scst/include/scst_const.h
+++ b/scst/include/scst_const.h
@@ -310,8 +310,6 @@ enum scst_cdb_flags {
 ** Misc SCSI constants
 *************************************************************/
 #define SCST_SENSE_ASC_UA_RESET	0x29
-#define READ_CAP_LEN		8
-#define READ_CAP16_LEN		32
 #define BYTCHK			0x02
 #define POSITION_LEN_SHORT	20
 #define POSITION_LEN_LONG	32
diff --git a/scst/src/dev_handlers/scst_changer.c b/scst/src/dev_handlers/scst_changer.c
index 1786fcb62..a30736f74 100644
--- a/scst/src/dev_handlers/scst_changer.c
+++ b/scst/src/dev_handlers/scst_changer.c
@@ -28,7 +28,6 @@
 
 #define CHANGER_NAME		"dev_changer"
 #define CHANGER_RETRIES		2
-#define READ_CAP_LEN		8
 
 static int changer_attach(struct scst_device *);
 /* static void changer_detach(struct scst_device *); */
diff --git a/scst/src/dev_handlers/scst_processor.c b/scst/src/dev_handlers/scst_processor.c
index 0a2668099..694aadd16 100644
--- a/scst/src/dev_handlers/scst_processor.c
+++ b/scst/src/dev_handlers/scst_processor.c
@@ -28,7 +28,6 @@
 
 #define PROCESSOR_NAME		"dev_processor"
 #define PROCESSOR_RETRIES	2
-#define READ_CAP_LEN		8
 
 static int processor_attach(struct scst_device *);
 /*static void processor_detach(struct scst_device *);*/
diff --git a/scst/src/dev_handlers/scst_raid.c b/scst/src/dev_handlers/scst_raid.c
index 91aaa5b11..06d383b83 100644
--- a/scst/src/dev_handlers/scst_raid.c
+++ b/scst/src/dev_handlers/scst_raid.c
@@ -28,7 +28,6 @@
 
 #define RAID_NAME		"dev_raid"
 #define RAID_RETRIES		2
-#define READ_CAP_LEN		8
 
 static int raid_attach(struct scst_device *);
 /* static void raid_detach(struct scst_device *); */
diff --git a/scst/src/dev_handlers/scst_user.c b/scst/src/dev_handlers/scst_user.c
index 3f36b723b..15da0349c 100644
--- a/scst/src/dev_handlers/scst_user.c
+++ b/scst/src/dev_handlers/scst_user.c
@@ -665,7 +665,7 @@ static int dev_user_alloc_space(struct scst_user_cmd *ucmd)
 	TRACE_ENTRY();
 
 	ucmd->state = UCMD_STATE_BUF_ALLOCING;
-	cmd->dh_data_buf_alloced = 1;
+	scst_cmd_set_dh_data_buff_alloced(cmd);
 
 	rc = dev_user_alloc_sg(ucmd, is_buff_cached(ucmd));
 	if (rc == 0)
diff --git a/scst/src/dev_handlers/scst_vdisk.c b/scst/src/dev_handlers/scst_vdisk.c
index 2e130be71..06991c13b 100644
--- a/scst/src/dev_handlers/scst_vdisk.c
+++ b/scst/src/dev_handlers/scst_vdisk.c
@@ -1933,7 +1933,7 @@ static void vdisk_exec_read_capacity(struct scst_cmd *cmd)
 	struct scst_vdisk_dev *virt_dev;
 	uint32_t blocksize;
 	uint64_t nblocks;
-	uint8_t buffer[READ_CAP_LEN];
+	uint8_t buffer[8];
 
 	TRACE_ENTRY();
 
@@ -1979,8 +1979,8 @@ static void vdisk_exec_read_capacity(struct scst_cmd *cmd)
 		goto out;
 	}
 
-	if (length > READ_CAP_LEN)
-		length = READ_CAP_LEN;
+	length = min_t(int, length, sizeof(buffer));
+
 	memcpy(address, buffer, length);
 
 	scst_put_buf(cmd, address);
@@ -2000,7 +2000,7 @@ static void vdisk_exec_read_capacity16(struct scst_cmd *cmd)
 	struct scst_vdisk_dev *virt_dev;
 	uint32_t blocksize;
 	uint64_t nblocks;
-	uint8_t buffer[READ_CAP16_LEN];
+	uint8_t buffer[32];
 
 	TRACE_ENTRY();
 
@@ -2063,23 +2063,8 @@ static void vdisk_exec_read_capacity16(struct scst_cmd *cmd)
 		goto out;
 	}
 
-	/*
-	 * Some versions of Windows have a bug, which makes them consider
-	 * response of READ CAPACITY(16) longer than 12 bytes as a faulty one.
-	 * As the result, such Windows'es refuse to see SCST exported
-	 * devices >2TB in size. This is fixed by MS in latter Windows
-	 * versions, probably, by some hotfix.
-	 *
-	 * But if you're using such buggy Windows and experience this problem,
-	 * change this '1' to '0'.
-	 */
-#if 0 /* there are too many such hosts */
-	if (length > READ_CAP16_LEN)
-		length = READ_CAP16_LEN;
-#else
-	if (length > 12)
-		length = 12;
-#endif
+	length = min_t(int, length, sizeof(buffer));
+
 	memcpy(address, buffer, length);
 
 	scst_put_buf(cmd, address);
diff --git a/scst/src/scst_lib.c b/scst/src/scst_lib.c
index 47ae0373e..ca887f745 100644
--- a/scst/src/scst_lib.c
+++ b/scst/src/scst_lib.c
@@ -27,9 +27,10 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
+#include 
+#include 
 
 #include "scst.h"
 #include "scst_priv.h"
@@ -4500,7 +4501,7 @@ static int get_trans_len_block_limit(struct scst_cmd *cmd, uint8_t off)
 
 static int get_trans_len_read_capacity(struct scst_cmd *cmd, uint8_t off)
 {
-	cmd->bufflen = READ_CAP_LEN;
+	cmd->bufflen = 8;
 	return 0;
 }
 
@@ -4512,7 +4513,7 @@ static int get_trans_len_serv_act_in(struct scst_cmd *cmd, uint8_t off)
 
 	if ((cmd->cdb[1] & 0x1f) == SAI_READ_CAPACITY_16) {
 		cmd->op_name = "READ CAPACITY(16)";
-		cmd->bufflen = READ_CAP16_LEN;
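+		/*
+		 * READ CAPACITY(16) carries its ALLOCATION LENGTH in CDB
+		 * bytes 10-13, so take the expected transfer length from
+		 * the CDB instead of assuming a fixed 32-byte response.
+		 */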
+		cmd->bufflen = be32_to_cpu(get_unaligned((__be32 *)&cmd->cdb[10]));
 		cmd->op_flags |= SCST_IMPLICIT_HQ|SCST_REG_RESERVE_ALLOWED;
 	} else
 		cmd->op_flags |= SCST_UNKNOWN_LENGTH;
diff --git a/usr/fileio/common.c b/usr/fileio/common.c
index bf3eb8fda..eb1c27279 100644
--- a/usr/fileio/common.c
+++ b/usr/fileio/common.c
@@ -314,6 +314,8 @@ static int do_exec(struct vdisk_cmd *vcmd)
 		}
 	}
 
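+	/*
+	 * Assume a full transfer by default; the exec_*() handlers below
+	 * shrink resp_data_len for shorter responses, which lets the
+	 * target report the correct residual.
+	 */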
+	reply->resp_data_len = cmd->bufflen;
+
 	switch (opcode) {
 	case READ_6:
 	case WRITE_6:
@@ -1059,8 +1061,11 @@ static void exec_inquiry(struct vdisk_cmd *vcmd)
 	sBUG_ON(resp_len >= (int)sizeof(buf));
 	if (length > resp_len)
 		length = resp_len;
+
 	memcpy(address, buf, length);
-	reply->resp_data_len = length;
+
+	if (length < reply->resp_data_len)
+		reply->resp_data_len = length;
 
 out:
 	TRACE_EXIT();
@@ -1080,8 +1085,11 @@ static void exec_request_sense(struct vdisk_cmd *vcmd)
 	l = set_sense(b, sizeof(b),
 		SCST_LOAD_SENSE(scst_sense_no_sense));
 	length = min(l, length);
+
 	memcpy(address, b, length);
-	reply->resp_data_len = length;
+
+	if (length < reply->resp_data_len)
+		reply->resp_data_len = length;
 
 	TRACE_EXIT();
 	return;
@@ -1318,8 +1326,11 @@ static void exec_mode_sense(struct vdisk_cmd *vcmd)
 	sBUG_ON(offset >= (int)sizeof(buf));
 	if (offset > length)
 		offset = length;
+
 	memcpy(address, buf, offset);
-	reply->resp_data_len = offset;
+
+	if (offset < reply->resp_data_len)
+		reply->resp_data_len = offset;
 
 out:
 	TRACE_EXIT();
@@ -1420,7 +1431,7 @@ static void exec_read_capacity(struct vdisk_cmd *vcmd)
 	uint8_t *address = (uint8_t*)(unsigned long)cmd->pbuf;
 	uint32_t blocksize;
 	uint64_t nblocks;
-	uint8_t buffer[READ_CAP_LEN];
+	uint8_t buffer[8];
 
 	TRACE_ENTRY();
 
@@ -1445,11 +1456,12 @@ static void exec_read_capacity(struct vdisk_cmd *vcmd)
 	buffer[6] = (blocksize >> (BYTE * 1)) & 0xFF;
 	buffer[7] = (blocksize >> (BYTE * 0)) & 0xFF;
 
-	if (length > READ_CAP_LEN)
-		length = READ_CAP_LEN;
+	length = min(length, (int)sizeof(buffer));
+
 	memcpy(address, buffer, length);
-	reply->resp_data_len = length;
+	if (length < reply->resp_data_len)
+		reply->resp_data_len = length;
 
 	TRACE_EXIT();
 	return;
@@ -1464,7 +1476,7 @@ static void exec_read_capacity16(struct vdisk_cmd *vcmd)
 	uint8_t *address = (uint8_t*)(unsigned long)cmd->pbuf;
 	uint32_t blocksize;
 	uint64_t nblocks;
-	uint8_t buffer[READ_CAP16_LEN];
+	uint8_t buffer[32];
 
 	TRACE_ENTRY();
 
@@ -1506,11 +1518,12 @@ static void exec_read_capacity16(struct vdisk_cmd *vcmd)
 		break;
 	}
 
-	if (length > READ_CAP16_LEN)
-		length = READ_CAP16_LEN;
+	length = min(length, (int)sizeof(buffer));
+
 	memcpy(address, buffer, length);
-	reply->resp_data_len = length;
+	if (length < reply->resp_data_len)
+		reply->resp_data_len = length;
 
 	TRACE_EXIT();
 	return;
@@ -1585,8 +1598,11 @@ static void exec_read_toc(struct vdisk_cmd *vcmd)
 
 	if (off > length)
 		off = length;
+
 	memcpy(address, buffer, off);
-	reply->resp_data_len = off;
+
+	if (off < reply->resp_data_len)
+		reply->resp_data_len = off;
 
 out:
 	TRACE_EXIT();