scst_vdisk: Add zero-copy file I/O read support

Speeds up reading from a RAM disk via ib_srpt by about 30%.
Tested with iSCSI-SCST, ib_srpt and scst_local.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>

Changes made on top of the original patch:

- Fixed NULLIO, which the original patch broke

- Changed sBUG_ON() to EXTRACHECKS_BUG_ON() on all fast paths

- Fixed error paths in non_fileio_exec() and fileio_alloc_data_buf().

- Renamed zero_copy_read to just zero_copy.

- Docs and change log updated

- Some cleanups




git-svn-id: http://svn.code.sf.net/p/scst/svn/trunk@4213 d57e44dd-8a1f-0410-8b47-8ef2f437770f
This commit is contained in:
Vladislav Bolkhovitin
2012-04-18 01:19:28 +00:00
parent 2d71d64481
commit f742ae018e
13 changed files with 721 additions and 480 deletions

View File

@@ -1,7 +1,7 @@
Summary of changes between versions 2.1.0 and 3.0
-------------------------------------------------
- Update to kernels up to 3.1
- Update to kernels up to 3.3
- Bug fixes and other improvements

View File

@@ -1,7 +1,7 @@
Summary of changes between versions 2.1.0 and 3.0
-------------------------------------------------
- Update to kernels up to 3.1
- Update to kernels up to 3.3
- Bug fixes and other improvements

View File

@@ -1,7 +1,9 @@
Summary of changes between versions 2.1.0 and 3.0
-------------------------------------------------
- Update to kernels up to 3.1
- Update to kernels up to 3.3
- Zero copy read side FILEIO implemented
- New initialization scripts implemented

View File

@@ -886,6 +886,9 @@ cache. The following parameters possible for vdisk_fileio:
- rotational - if set, this device reported as rotational. Otherwise,
it is reported as non-rotational (SSD, etc.)
- zero_copy - if set, then this device uses zero copy access to the
page cache. At the moment, only read side zero copy is implemented.
Handler vdisk_blockio provides BLOCKIO mode to create virtual devices.
This mode performs direct block I/O with a block device, bypassing the
page cache for all operations. This mode works ideally with high-end

View File

@@ -750,6 +750,9 @@ cache. The following parameters possible for vdisk_fileio:
- rotational - if set, this device reported as rotational. Otherwise,
it is reported as non-rotational (SSD, etc.)
- zero_copy - if set, then this device uses zero copy access to the
page cache. At the moment, only read side zero copy is implemented.
Handler vdisk_blockio provides BLOCKIO mode to create virtual devices.
This mode performs direct block I/O with a block device, bypassing the
page cache for all operations. This mode works ideally with high-end

View File

@@ -2420,18 +2420,6 @@ struct scst_device {
#endif
};
/*
 * Used to store threads local tgt_dev specific data.
 *
 * scst_add_thr_data() initializes this header and links it into
 * tgt_dev->thr_data_list.
 */
struct scst_thr_data_hdr {
/* List entry in tgt_dev->thr_data_list */
struct list_head thr_data_list_entry;
struct task_struct *owner_thr; /* the owner thread (set to current by scst_add_thr_data()) */
/*
 * Reference counter: set to 1 by scst_add_thr_data(), changed by
 * scst_thr_data_get() and scst_thr_data_put()
 */
atomic_t ref;
/*
 * Function that will be called on the tgt_dev destruction; more
 * precisely, scst_thr_data_put() invokes it when ref drops to zero
 */
void (*free_fn) (struct scst_thr_data_hdr *data);
};
/*
* Used to clearly dispose async io_context
*/
@@ -2473,10 +2461,6 @@ struct scst_tgt_dev {
struct scst_order_data *curr_order_data;
struct scst_order_data tgt_dev_order_data;
/* List of scst_thr_data_hdr and lock */
spinlock_t thr_data_lock;
struct list_head thr_data_list;
/* Pointer to lists of commands with the lock */
struct scst_cmd_threads *active_cmd_threads;
@@ -3974,7 +3958,8 @@ static inline int scst_check_local_events(struct scst_cmd *cmd)
return __scst_check_local_events(cmd, true);
}
int scst_set_cmd_abnormal_done_state(struct scst_cmd *cmd);
int scst_get_cmd_abnormal_done_state(const struct scst_cmd *cmd);
void scst_set_cmd_abnormal_done_state(struct scst_cmd *cmd);
struct scst_trace_log {
unsigned int val;
@@ -4129,34 +4114,6 @@ void scst_cmd_put(struct scst_cmd *cmd);
struct scatterlist *scst_alloc(int size, gfp_t gfp_mask, int *count);
void scst_free(struct scatterlist *sg, int count);
void scst_add_thr_data(struct scst_tgt_dev *tgt_dev,
struct scst_thr_data_hdr *data,
void (*free_fn) (struct scst_thr_data_hdr *data));
void scst_del_all_thr_data(struct scst_tgt_dev *tgt_dev);
void scst_dev_del_all_thr_data(struct scst_device *dev);
struct scst_thr_data_hdr *__scst_find_thr_data(struct scst_tgt_dev *tgt_dev,
struct task_struct *tsk);
/*
 * Finds the data local to the current thread for the given tgt_dev by calling
 * __scst_find_thr_data() with "current" as the task.
 *
 * Returns NULL if no such data is found. On success the lookup has already
 * incremented the entry's reference counter (see
 * __scst_find_thr_data_locked()).
 */
static inline struct scst_thr_data_hdr *scst_find_thr_data(
struct scst_tgt_dev *tgt_dev)
{
return __scst_find_thr_data(tgt_dev, current);
}
/*
 * Increases the reference counter of the thread data by one (atomic
 * increment of data->ref).
 */
static inline void scst_thr_data_get(struct scst_thr_data_hdr *data)
{
atomic_inc(&data->ref);
}
/*
 * Drops one reference to the thread data. When the counter reaches zero,
 * the data's free_fn() callback is invoked on it.
 */
static inline void scst_thr_data_put(struct scst_thr_data_hdr *data)
{
	/* Somebody else still holds a reference - nothing to release yet */
	if (!atomic_dec_and_test(&data->ref))
		return;

	data->free_fn(data);
}
int scst_calc_block_shift(int sector_size);
int scst_sbc_generic_parse(struct scst_cmd *cmd,
int (*get_block_shift)(struct scst_cmd *cmd));

View File

@@ -677,7 +677,7 @@ static int dev_user_alloc_space(struct scst_user_cmd *ucmd)
goto out;
else if (rc < 0) {
scst_set_busy(cmd);
res = scst_set_cmd_abnormal_done_state(cmd);
res = scst_get_cmd_abnormal_done_state(cmd);
goto out;
}
@@ -860,7 +860,7 @@ out_invalid:
scst_set_cmd_error(cmd, SCST_LOAD_SENSE(scst_sense_invalid_opcode));
out_error:
res = scst_set_cmd_abnormal_done_state(cmd);
res = scst_get_cmd_abnormal_done_state(cmd);
goto out;
}

File diff suppressed because it is too large Load Diff

View File

@@ -1872,7 +1872,7 @@ void scst_check_reassign_sessions(void)
return;
}
static int scst_get_cmd_abnormal_done_state(const struct scst_cmd *cmd)
int scst_get_cmd_abnormal_done_state(const struct scst_cmd *cmd)
{
int res;
@@ -1937,16 +1937,15 @@ static int scst_get_cmd_abnormal_done_state(const struct scst_cmd *cmd)
TRACE_EXIT_RES(res);
return res;
}
EXPORT_SYMBOL_GPL(scst_get_cmd_abnormal_done_state);
/**
* scst_set_cmd_abnormal_done_state() - set command's next abnormal done state
*
* Sets state of the SCSI target state machine to abnormally complete command
* ASAP.
*
* Returns the new state.
*/
int scst_set_cmd_abnormal_done_state(struct scst_cmd *cmd)
void scst_set_cmd_abnormal_done_state(struct scst_cmd *cmd)
{
TRACE_ENTRY();
@@ -2005,8 +2004,8 @@ int scst_set_cmd_abnormal_done_state(struct scst_cmd *cmd)
}
#endif
TRACE_EXIT_RES(cmd->state);
return cmd->state;
TRACE_EXIT();
return;
}
EXPORT_SYMBOL_GPL(scst_set_cmd_abnormal_done_state);
@@ -3375,8 +3374,6 @@ static int scst_alloc_add_tgt_dev(struct scst_session *sess,
spin_lock_init(&tgt_dev->tgt_dev_lock);
INIT_LIST_HEAD(&tgt_dev->UA_list);
spin_lock_init(&tgt_dev->thr_data_lock);
INIT_LIST_HEAD(&tgt_dev->thr_data_list);
scst_init_order_data(&tgt_dev->tgt_dev_order_data);
if (dev->tst == SCST_CONTR_MODE_SEP_TASK_SETS)
@@ -3540,8 +3537,6 @@ static void scst_free_tgt_dev(struct scst_tgt_dev *tgt_dev)
scst_tgt_dev_stop_threads(tgt_dev);
sBUG_ON(!list_empty(&tgt_dev->thr_data_list));
kmem_cache_free(scst_tgtd_cachep, tgt_dev);
TRACE_EXIT();
@@ -6363,138 +6358,6 @@ out_unlock:
return res;
}
/*****************************************************************
** The following thr_data functions are necessary, because the
** kernel doesn't provide a better way to have threads local
** storage
*****************************************************************/
/**
 * scst_add_thr_data() - add the current thread's local data
 * @tgt_dev:	tgt_dev the data are attached to
 * @data:	header of the thread local data being registered
 * @free_fn:	callback stored in @data->free_fn; checked against NULL
 *		with EXTRACHECKS_BUG_ON()
 *
 * Adds local to the current thread data to tgt_dev
 * (they will be local for the tgt_dev and current thread).
 *
 * The header is initialized with the current task as the owner and a
 * reference counter of 1, then appended to the tail of
 * @tgt_dev->thr_data_list under @tgt_dev->thr_data_lock.
 */
void scst_add_thr_data(struct scst_tgt_dev *tgt_dev,
struct scst_thr_data_hdr *data,
void (*free_fn) (struct scst_thr_data_hdr *data))
{
data->owner_thr = current;
/* Initial reference; scst_del_thr_data() and scst_del_all_thr_data() drop it after list_del() */
atomic_set(&data->ref, 1);
EXTRACHECKS_BUG_ON(free_fn == NULL);
data->free_fn = free_fn;
spin_lock(&tgt_dev->thr_data_lock);
list_add_tail(&data->thr_data_list_entry, &tgt_dev->thr_data_list);
spin_unlock(&tgt_dev->thr_data_lock);
}
EXPORT_SYMBOL_GPL(scst_add_thr_data);
/**
* scst_del_all_thr_data() - delete all thread's local data
*
* Deletes all local to threads data from tgt_dev
*/
void scst_del_all_thr_data(struct scst_tgt_dev *tgt_dev)
{
spin_lock(&tgt_dev->thr_data_lock);
while (!list_empty(&tgt_dev->thr_data_list)) {
struct scst_thr_data_hdr *d = list_entry(
tgt_dev->thr_data_list.next, typeof(*d),
thr_data_list_entry);
list_del(&d->thr_data_list_entry);
spin_unlock(&tgt_dev->thr_data_lock);
scst_thr_data_put(d);
spin_lock(&tgt_dev->thr_data_lock);
}
spin_unlock(&tgt_dev->thr_data_lock);
return;
}
EXPORT_SYMBOL_GPL(scst_del_all_thr_data);
/**
* scst_dev_del_all_thr_data() - delete all thread's local data from device
*
* Deletes all local to threads data from all tgt_dev's of the device
*/
void scst_dev_del_all_thr_data(struct scst_device *dev)
{
struct scst_tgt_dev *tgt_dev;
TRACE_ENTRY();
mutex_lock(&scst_mutex);
list_for_each_entry(tgt_dev, &dev->dev_tgt_dev_list,
dev_tgt_dev_list_entry) {
scst_del_all_thr_data(tgt_dev);
}
mutex_unlock(&scst_mutex);
TRACE_EXIT();
return;
}
EXPORT_SYMBOL_GPL(scst_dev_del_all_thr_data);
/*
 * Looks up the entry of tgt_dev->thr_data_list owned by task tsk.
 * thr_data_lock is supposed to be held.
 *
 * Returns the first matching entry with its reference counter incremented,
 * or NULL if there is no entry for tsk.
 */
static struct scst_thr_data_hdr *__scst_find_thr_data_locked(
	struct scst_tgt_dev *tgt_dev, struct task_struct *tsk)
{
	struct scst_thr_data_hdr *cur;

	list_for_each_entry(cur, &tgt_dev->thr_data_list, thr_data_list_entry) {
		if (cur->owner_thr != tsk)
			continue;

		scst_thr_data_get(cur);
		return cur;
	}

	return NULL;
}
/**
 * __scst_find_thr_data() - find local to the thread data
 * @tgt_dev:	tgt_dev whose thr_data_list is searched
 * @tsk:	task owning the data being looked up
 *
 * Runs __scst_find_thr_data_locked() under @tgt_dev->thr_data_lock.
 * Returns the data found (with its reference counter incremented by the
 * lookup) or NULL if there is none for @tsk.
 */
struct scst_thr_data_hdr *__scst_find_thr_data(struct scst_tgt_dev *tgt_dev,
	struct task_struct *tsk)
{
	struct scst_thr_data_hdr *found;

	spin_lock(&tgt_dev->thr_data_lock);
	found = __scst_find_thr_data_locked(tgt_dev, tsk);
	spin_unlock(&tgt_dev->thr_data_lock);

	return found;
}
EXPORT_SYMBOL_GPL(__scst_find_thr_data);
/*
 * Removes the data local to task tsk from tgt_dev->thr_data_list, if there
 * is any, and drops the references held on it.
 *
 * Returns true if an entry for tsk was found and removed, false otherwise.
 */
bool scst_del_thr_data(struct scst_tgt_dev *tgt_dev, struct task_struct *tsk)
{
	struct scst_thr_data_hdr *td;

	spin_lock(&tgt_dev->thr_data_lock);
	td = __scst_find_thr_data_locked(tgt_dev, tsk);
	if (td != NULL)
		list_del(&td->thr_data_list_entry);
	spin_unlock(&tgt_dev->thr_data_lock);

	if (td == NULL)
		return false;

	/*
	 * Two puts: one for the reference taken by the lookup above and one
	 * for the initial reference set in scst_add_thr_data().
	 */
	scst_thr_data_put(td);
	scst_thr_data_put(td);

	return true;
}
static void __scst_unblock_deferred(struct scst_order_data *order_data,
struct scst_cmd *out_of_sn_cmd)
{

View File

@@ -1717,7 +1717,6 @@ void scst_del_threads(struct scst_cmd_threads *cmd_threads, int num)
list_for_each_entry_safe_reverse(ct, tmp, &cmd_threads->threads_list,
thread_list_entry) {
int rc;
struct scst_device *dev;
rc = kthread_stop(ct->cmd_thread);
if (rc != 0 && rc != -EINTR)
@@ -1725,14 +1724,6 @@ void scst_del_threads(struct scst_cmd_threads *cmd_threads, int num)
list_del(&ct->thread_list_entry);
list_for_each_entry(dev, &scst_dev_list, dev_list_entry) {
struct scst_tgt_dev *tgt_dev;
list_for_each_entry(tgt_dev, &dev->dev_tgt_dev_list,
dev_tgt_dev_list_entry) {
scst_del_thr_data(tgt_dev, ct->cmd_thread);
}
}
kfree(ct);
cmd_threads->nr_threads--;

View File

@@ -233,9 +233,6 @@ extern void scst_stop_dev_threads(struct scst_device *dev);
extern int scst_tgt_dev_setup_threads(struct scst_tgt_dev *tgt_dev);
extern void scst_tgt_dev_stop_threads(struct scst_tgt_dev *tgt_dev);
extern bool scst_del_thr_data(struct scst_tgt_dev *tgt_dev,
struct task_struct *tsk);
extern struct scst_dev_type scst_null_devtype;
extern struct scst_cmd *__scst_check_deferred_commands(

View File

@@ -597,7 +597,8 @@ static int scst_parse_cmd(struct scst_cmd *cmd)
} else
state = SCST_CMD_STATE_PREPARE_SPACE;
if (unlikely(state == SCST_CMD_STATE_PRE_XMIT_RESP))
if (unlikely(state == SCST_CMD_STATE_PRE_XMIT_RESP) ||
unlikely(state == SCST_CMD_STATE_PREPROCESSING_DONE))
goto set_res;
if (unlikely(!(cmd->op_flags & SCST_INFO_VALID))) {
@@ -815,6 +816,7 @@ set_res:
case SCST_CMD_STATE_PREPARE_SPACE:
case SCST_CMD_STATE_PARSE:
case SCST_CMD_STATE_RDY_TO_XFER:
case SCST_CMD_STATE_PREPROCESSING_DONE:
case SCST_CMD_STATE_TGT_PRE_EXEC:
case SCST_CMD_STATE_EXEC_CHECK_SN:
case SCST_CMD_STATE_EXEC_CHECK_BLOCKING:
@@ -853,10 +855,6 @@ set_res:
cmd->resp_data_len = 0;
}
/* We already completed (with an error) */
if (unlikely(cmd->completed))
goto out_done;
#ifndef CONFIG_SCST_TEST_IO_IN_SIRQ
/*
* We can't allow atomic command on the exec stages. It shouldn't

View File

@@ -49,50 +49,6 @@
<h1>Possible SCST extensions and improvements</h1>
<A NAME="ZC_READ"></A><h3>Zero-copy FILEIO for READ-direction commands</h3>
<p>At the moment, SCST in FILEIO mode uses standard Linux read() and write() syscalls paths,
which copy data from the page cache to the supplied buffer and back. Zero-copy FILEIO
would use page cache data directly. This would be a major performance improvement,
especially for fast hardware, like Infiniband, because it would eliminate the data copy
latency as well as considerably ease CPU and memory bandwidth load. This proposal is limited for
READs only, because for WRITEs it is a lot harder to
implement, so it is worth to do zero-copy for READs and WRITEs separately.</p>
<p>The main idea is to add one more flag to filp_open() "flags" parameter
(like O_RDONLY, O_DIRECT, etc.) O_ZEROCOPY, which would be available
only if the caller is from the kernel space. In this case fd->f_op->readv(),
do_sync_readv_writev(), etc. would receive as the pointer to data
buffer not a real data buffer, but pointer to an empty SG vector. Then:</p>
<ul>
<li><span>Generic buffer allocation in SCST would not be used, instead vdisk_parse()
would allocate the SG vector, but wouldn't fill it with actual pages.</span></li>
<li><span>In generic_file_aio_read(), if O_ZEROCOPY flag was set,
function do_generic_file_read() would be called with the last parameter set
to a pointer to new function file_zero_copy_read_actor() instead of file_read_actor().</span></li>
<li><span>Function file_zero_copy_read_actor() would be basically the same as
file_read_actor(), but, instead of copy data using __copy_to_user*() functions,
it would add the supplied page to the appropriate place in the received in
desc->arg.buf SG vector and reference, i.e. page_get(), that page.</span></li>
<li><span>In vdisk_devtype.on_free_cmd(), which doesn't exist yet, all pages
from the SG vector would be dereferenced, i.e. page_put(). Then the SG vector itself
would be freed.</span></li>
</ul>
<p>That's all. For WRITEs the current code path would remain unchanged.</p>
<A NAME="ZC_WRITE"></A><h3>Zero-copy FILEIO for WRITE-direction commands</h3>
<p>Implementation should be similar to zero-copy FILEIO for READ commands and should
be done after it. All incoming data should be inserted in the page cache, then dereferenced in
vdisk_devtype.on_free_cmd(). The main problem is insertion of data pages in the
page cache, namely, locking issues related to it. They should be carefully
investigated.</p>
<A NAME="O_DIRECT"></A><h3>Support for O_DIRECT in scst_vdisk handler</h3>
<p>At the moment, scst_vdisk handler doesn't support O_DIRECT option and possibility to set it
@@ -108,7 +64,7 @@
<A NAME="SG_LIMIT"></A><h3>Solve SG IO count limitation issue in pass-through mode</h3>
<p>In the pass-through mode (i.e. using the pass-through device handlers like
scst_tape, etc) SCSI commands, coming from remote initiators,
scst_tape, etc.) SCSI commands, coming from remote initiators,
are passed to local SCSI hardware on target as is, without any
modifications. As any other hardware, the local SCSI hardware can not
handle commands with amount of data and/or segments count in
@@ -122,8 +78,6 @@
messages like: "Unable to complete command due to SG IO count
limitation" are printed in the kernel logs.</p>
<p>
<p>The only complete way to fix this problem is to allocate data buffers with number
of entries inside the SG IO count limitation. In <a href="sgv_big_order_alloc.diff">sgv_big_order_alloc.diff</a>
you can find a possible way to solve this issue.</p>
@@ -133,6 +87,8 @@
created by Frank Zago for SCST 2.0.0. It was submitted too late to be included in it.
Update for SCST trunk is welcome!</p>
<p>Note, scst_disk handler already implements a workaround for it.</p>
<A NAME="MEM_REG"></A><h3>Memory registration</h3>
<p>In some cases a target driver might need to register memory used for data buffers in the