diff --git a/scst/README b/scst/README
index 708305826..d7d786feb 100644
--- a/scst/README
+++ b/scst/README
@@ -84,6 +84,10 @@ subsystem and greatly improves performance for software RAIDs. See
 http://sourceforge.net/mailarchive/forum.php?thread_name=a0272b440906030714g67eabc5k8f847fb1e538cc62%40mail.gmail.com&forum_name=scst-devel
 thread for more details.
 
+4. readahead-context-2.6.X.patch. This is a backported version of the
+context readahead patch http://lkml.org/lkml/2009/4/12/9; big thanks to
+Wu Fengguang! This is a performance improvement patch.
+
 Then, to compile SCST type 'make scst'. It will build SCST itself and its
 device handlers. To install them type 'make scst_install'. The driver
 modules will be installed in '/lib/modules/`you_kernel_version`/extra'.
diff --git a/scst/kernel/readahead-context-2.6.29.patch b/scst/kernel/readahead-context-2.6.29.patch
new file mode 100644
index 000000000..2ba8809ed
--- /dev/null
+++ b/scst/kernel/readahead-context-2.6.29.patch
@@ -0,0 +1,231 @@
+--- linux.orig/mm/readahead.c
++++ linux/mm/readahead.c
+@@ -337,6 +337,59 @@ static unsigned long get_next_ra_size(st
+  */
+ 
+ /*
++ * Count contiguously cached pages from @offset-1 to @offset-@max,
++ * this count is a conservative estimation of
++ *	- length of the sequential read sequence, or
++ *	- thrashing threshold in memory tight systems
++ */
++static pgoff_t count_history_pages(struct address_space *mapping,
++				   struct file_ra_state *ra,
++				   pgoff_t offset, unsigned long max)
++{
++	pgoff_t head;
++
++	rcu_read_lock();
++	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
++	rcu_read_unlock();
++
++	return offset - 1 - head;
++}
++
++/*
++ * page cache context based read-ahead
++ */
++static int try_context_readahead(struct address_space *mapping,
++				 struct file_ra_state *ra,
++				 pgoff_t offset,
++				 unsigned long req_size,
++				 unsigned long max)
++{
++	pgoff_t size;
++
++	size = count_history_pages(mapping, ra, offset, max);
++
++	/*
++	 * no history pages:
++	 * it could be a random read
++	 */
++	if (!size)
++		return 0;
++
++	/*
++	 * starts from beginning of file:
++	 * it is a strong indication of long-run stream (or whole-file-read)
++	 */
++	if (size >= offset)
++		size *= 2;
++
++	ra->start = offset;
++	ra->size = get_init_ra_size(size + req_size, max);
++	ra->async_size = ra->size;
++
++	return 1;
++}
++
++/*
+  * A minimal readahead algorithm for trivial sequential/random reads.
+  */
+ static unsigned long
+@@ -345,34 +398,26 @@ ondemand_readahead(struct address_space
+ 		   bool hit_readahead_marker, pgoff_t offset,
+ 		   unsigned long req_size)
+ {
+-	int max = ra->ra_pages;	/* max readahead pages */
+-	pgoff_t prev_offset;
+-	int sequential;
++	unsigned long max = max_sane_readahead(ra->ra_pages);
++
++	/*
++	 * start of file
++	 */
++	if (!offset)
++		goto initial_readahead;
+ 
+ 	/*
+ 	 * It's the expected callback offset, assume sequential access.
+ 	 * Ramp up sizes, and push forward the readahead window.
+ 	 */
+-	if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
+-			offset == (ra->start + ra->size))) {
++	if ((offset == (ra->start + ra->size - ra->async_size) ||
++	     offset == (ra->start + ra->size))) {
+ 		ra->start += ra->size;
+ 		ra->size = get_next_ra_size(ra, max);
+ 		ra->async_size = ra->size;
+ 		goto readit;
+ 	}
+ 
+-	prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
+-	sequential = offset - prev_offset <= 1UL || req_size > max;
+-
+-	/*
+-	 * Standalone, small read.
+-	 * Read as is, and do not pollute the readahead state.
+-	 */
+-	if (!hit_readahead_marker && !sequential) {
+-		return __do_page_cache_readahead(mapping, filp,
+-						offset, req_size, 0);
+-	}
+-
+ 	/*
+ 	 * Hit a marked page without valid readahead state.
+ 	 * E.g. interleaved reads.
+@@ -383,7 +428,7 @@ ondemand_readahead(struct address_space
+ 		pgoff_t start;
+ 
+ 		rcu_read_lock();
+-		start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
++		start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
+ 		rcu_read_unlock();
+ 
+ 		if (!start || start - offset > max)
+@@ -391,23 +436,53 @@ ondemand_readahead(struct address_space
+ 		ra->start = start;
+ 		ra->size = start - offset;	/* old async_size */
++		ra->size += req_size;
+ 		ra->size = get_next_ra_size(ra, max);
+ 		ra->async_size = ra->size;
+ 		goto readit;
+ 	}
+ 
+ 	/*
+-	 * It may be one of
+-	 *   - first read on start of file
+-	 *   - sequential cache miss
+-	 *   - oversize random read
+-	 * Start readahead for it.
++	 * oversize read
++	 */
++	if (req_size > max)
++		goto initial_readahead;
++
++	/*
++	 * sequential cache miss
+ 	 */
++	if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
++		goto initial_readahead;
++
++	/*
++	 * Query the page cache and look for the traces (cached history pages)
++	 * that a sequential stream would leave behind.
++	 */
++	if (try_context_readahead(mapping, ra, offset, req_size, max))
++		goto readit;
++
++	/*
++	 * standalone, small random read
++	 * Read as is, and do not pollute the readahead state.
++	 */
++	return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
++
++initial_readahead:
+ 	ra->start = offset;
+ 	ra->size = get_init_ra_size(req_size, max);
+ 	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
+ 
+ readit:
++	/*
++	 * Will this read hit the readahead marker made by itself?
++	 * If so, trigger the readahead marker hit now, and merge
++	 * the resulting next readahead window into the current one.
++	 */
++	if (offset == ra->start && ra->size == ra->async_size) {
++		ra->async_size = get_next_ra_size(ra, max);
++		ra->size += ra->async_size;
++	}
++
+ 	return ra_submit(ra, mapping, filp);
+ }
+ 
+--- linux.orig/lib/radix-tree.c
++++ linux/lib/radix-tree.c
+@@ -666,6 +666,43 @@ unsigned long radix_tree_next_hole(struc
+ }
+ EXPORT_SYMBOL(radix_tree_next_hole);
+ 
++/**
++ * radix_tree_prev_hole - find the prev hole (not-present entry)
++ * @root:	tree root
++ * @index:	index key
++ * @max_scan:	maximum range to search
++ *
++ * Search backwards in the range [max(index-max_scan+1, 0), index]
++ * for the first hole.
++ *
++ * Returns: the index of the hole if found, otherwise returns an index
++ * outside of the set specified (in which case 'index - return >= max_scan'
++ * will be true). In rare cases of wrap-around, LONG_MAX will be returned.
++ *
++ * radix_tree_prev_hole may be called under rcu_read_lock. However, like
++ * radix_tree_gang_lookup, this will not atomically search a snapshot of
++ * the tree at a single point in time. For example, if a hole is created
++ * at index 10, then subsequently a hole is created at index 5,
++ * radix_tree_prev_hole covering both indexes may return 5 if called under
++ * rcu_read_lock.
++ */
++unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
++				   unsigned long index, unsigned long max_scan)
++{
++	unsigned long i;
++
++	for (i = 0; i < max_scan; i++) {
++		if (!radix_tree_lookup(root, index))
++			break;
++		index--;
++		if (index == LONG_MAX)
++			break;
++	}
++
++	return index;
++}
++EXPORT_SYMBOL(radix_tree_prev_hole);
++
+ static unsigned int
+ __lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
+ 	unsigned int max_items, unsigned long *next_index)
+--- linux.orig/include/linux/radix-tree.h
++++ linux/include/linux/radix-tree.h
+@@ -167,6 +167,8 @@ radix_tree_gang_lookup_slot(struct radix
+ 		unsigned long first_index, unsigned int max_items);
+ unsigned long radix_tree_next_hole(struct radix_tree_root *root,
+ 				unsigned long index, unsigned long max_scan);
++unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
++				unsigned long index, unsigned long max_scan);
+ int radix_tree_preload(gfp_t gfp_mask);
+ void radix_tree_init(void);
+ void *radix_tree_tag_set(struct radix_tree_root *root,
diff --git a/scst/kernel/readahead-context-2.6.30.patch b/scst/kernel/readahead-context-2.6.30.patch
new file mode 100644
index 000000000..18e80e5dc
--- /dev/null
+++ b/scst/kernel/readahead-context-2.6.30.patch
@@ -0,0 +1,231 @@
+--- linux.orig/mm/readahead.c
++++ linux/mm/readahead.c
+@@ -348,6 +348,59 @@ static unsigned long get_next_ra_size(st
+  */
+ 
+ /*
++ * Count contiguously cached pages from @offset-1 to @offset-@max,
++ * this count is a conservative estimation of
++ *	- length of the sequential read sequence, or
++ *	- thrashing threshold in memory tight systems
++ */
++static pgoff_t count_history_pages(struct address_space *mapping,
++				   struct file_ra_state *ra,
++				   pgoff_t offset, unsigned long max)
++{
++	pgoff_t head;
++
++	rcu_read_lock();
++	head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max);
++	rcu_read_unlock();
++
++	return offset - 1 - head;
++}
++
++/*
++ * page cache context based read-ahead
++ */
++static int try_context_readahead(struct address_space *mapping,
++				 struct file_ra_state *ra,
++				 pgoff_t offset,
++				 unsigned long req_size,
++				 unsigned long max)
++{
++	pgoff_t size;
++
++	size = count_history_pages(mapping, ra, offset, max);
++
++	/*
++	 * no history pages:
++	 * it could be a random read
++	 */
++	if (!size)
++		return 0;
++
++	/*
++	 * starts from beginning of file:
++	 * it is a strong indication of long-run stream (or whole-file-read)
++	 */
++	if (size >= offset)
++		size *= 2;
++
++	ra->start = offset;
++	ra->size = get_init_ra_size(size + req_size, max);
++	ra->async_size = ra->size;
++
++	return 1;
++}
++
++/*
+  * A minimal readahead algorithm for trivial sequential/random reads.
+  */
+ static unsigned long
+@@ -356,34 +409,26 @@ ondemand_readahead(struct address_space
+ 		   bool hit_readahead_marker, pgoff_t offset,
+ 		   unsigned long req_size)
+ {
+-	int max = ra->ra_pages;	/* max readahead pages */
+-	pgoff_t prev_offset;
+-	int sequential;
++	unsigned long max = max_sane_readahead(ra->ra_pages);
++
++	/*
++	 * start of file
++	 */
++	if (!offset)
++		goto initial_readahead;
+ 
+ 	/*
+ 	 * It's the expected callback offset, assume sequential access.
+ 	 * Ramp up sizes, and push forward the readahead window.
+ 	 */
+-	if (offset && (offset == (ra->start + ra->size - ra->async_size) ||
+-			offset == (ra->start + ra->size))) {
++	if ((offset == (ra->start + ra->size - ra->async_size) ||
++	     offset == (ra->start + ra->size))) {
+ 		ra->start += ra->size;
+ 		ra->size = get_next_ra_size(ra, max);
+ 		ra->async_size = ra->size;
+ 		goto readit;
+ 	}
+ 
+-	prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
+-	sequential = offset - prev_offset <= 1UL || req_size > max;
+-
+-	/*
+-	 * Standalone, small read.
+-	 * Read as is, and do not pollute the readahead state.
+-	 */
+-	if (!hit_readahead_marker && !sequential) {
+-		return __do_page_cache_readahead(mapping, filp,
+-						offset, req_size, 0);
+-	}
+-
+ 	/*
+ 	 * Hit a marked page without valid readahead state.
+ 	 * E.g. interleaved reads.
+@@ -394,7 +439,7 @@ ondemand_readahead(struct address_space
+ 		pgoff_t start;
+ 
+ 		rcu_read_lock();
+-		start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
++		start = radix_tree_next_hole(&mapping->page_tree, offset+1,max);
+ 		rcu_read_unlock();
+ 
+ 		if (!start || start - offset > max)
+@@ -402,23 +447,53 @@ ondemand_readahead(struct address_space
+ 		ra->start = start;
+ 		ra->size = start - offset;	/* old async_size */
++		ra->size += req_size;
+ 		ra->size = get_next_ra_size(ra, max);
+ 		ra->async_size = ra->size;
+ 		goto readit;
+ 	}
+ 
+ 	/*
+-	 * It may be one of
+-	 *   - first read on start of file
+-	 *   - sequential cache miss
+-	 *   - oversize random read
+-	 * Start readahead for it.
++	 * oversize read
++	 */
++	if (req_size > max)
++		goto initial_readahead;
++
++	/*
++	 * sequential cache miss
+ 	 */
++	if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
++		goto initial_readahead;
++
++	/*
++	 * Query the page cache and look for the traces (cached history pages)
++	 * that a sequential stream would leave behind.
++	 */
++	if (try_context_readahead(mapping, ra, offset, req_size, max))
++		goto readit;
++
++	/*
++	 * standalone, small random read
++	 * Read as is, and do not pollute the readahead state.
++	 */
++	return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
++
++initial_readahead:
+ 	ra->start = offset;
+ 	ra->size = get_init_ra_size(req_size, max);
+ 	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
+ 
+ readit:
++	/*
++	 * Will this read hit the readahead marker made by itself?
++	 * If so, trigger the readahead marker hit now, and merge
++	 * the resulting next readahead window into the current one.
++	 */
++	if (offset == ra->start && ra->size == ra->async_size) {
++		ra->async_size = get_next_ra_size(ra, max);
++		ra->size += ra->async_size;
++	}
++
+ 	return ra_submit(ra, mapping, filp);
+ }
+ 
+--- linux.orig/lib/radix-tree.c
++++ linux/lib/radix-tree.c
+@@ -666,6 +666,43 @@ unsigned long radix_tree_next_hole(struc
+ }
+ EXPORT_SYMBOL(radix_tree_next_hole);
+ 
++/**
++ * radix_tree_prev_hole - find the prev hole (not-present entry)
++ * @root:	tree root
++ * @index:	index key
++ * @max_scan:	maximum range to search
++ *
++ * Search backwards in the range [max(index-max_scan+1, 0), index]
++ * for the first hole.
++ *
++ * Returns: the index of the hole if found, otherwise returns an index
++ * outside of the set specified (in which case 'index - return >= max_scan'
++ * will be true). In rare cases of wrap-around, LONG_MAX will be returned.
++ *
++ * radix_tree_prev_hole may be called under rcu_read_lock. However, like
++ * radix_tree_gang_lookup, this will not atomically search a snapshot of
++ * the tree at a single point in time. For example, if a hole is created
++ * at index 10, then subsequently a hole is created at index 5,
++ * radix_tree_prev_hole covering both indexes may return 5 if called under
++ * rcu_read_lock.
++ */
++unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
++				   unsigned long index, unsigned long max_scan)
++{
++	unsigned long i;
++
++	for (i = 0; i < max_scan; i++) {
++		if (!radix_tree_lookup(root, index))
++			break;
++		index--;
++		if (index == LONG_MAX)
++			break;
++	}
++
++	return index;
++}
++EXPORT_SYMBOL(radix_tree_prev_hole);
++
+ static unsigned int
+ __lookup(struct radix_tree_node *slot, void ***results, unsigned long index,
+ 	unsigned int max_items, unsigned long *next_index)
+--- linux.orig/include/linux/radix-tree.h
++++ linux/include/linux/radix-tree.h
+@@ -167,6 +167,8 @@ radix_tree_gang_lookup_slot(struct radix
+ 		unsigned long first_index, unsigned int max_items);
+ unsigned long radix_tree_next_hole(struct radix_tree_root *root,
+ 				unsigned long index, unsigned long max_scan);
++unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
++				unsigned long index, unsigned long max_scan);
+ int radix_tree_preload(gfp_t gfp_mask);
+ void radix_tree_init(void);
+ void *radix_tree_tag_set(struct radix_tree_root *root,