orangefs: implement orangefs_readahead

mm/readahead.c/read_pages was quite a bit different back
 when I put my open-coded readahead logic into orangefs_readpage.
 It seemed to work as designed then, but it is a trainwreck now.
 
 This patch implements orangefs_readahead using new xarray
 and readahead_expand features that have just been pulled and
 removes all my open-coded readahead logic.
 
 This patch results in an extreme read performance improvement,
 these sample numbers are from my test VM:
 
 Here's an example of what's upstream in
 5.11.8-200.fc33.x86_64:
 
 30+0 records in
 30+0 records out
 125829120 bytes (126 MB, 120 MiB) copied, 5.77943 s, 21.8 MB/s
 
 And here's this version of orangefs_readahead on top of
 5.12.0-rc4:
 
 30+0 records in
 30+0 records out
 125829120 bytes (126 MB, 120 MiB) copied, 0.325919 s, 386 MB/s
 
 There are four xfstest regressions with this patch. David Howells
 and Matthew Wilcox have been helping me work with this code. One
 of the regressions has gone away with the most recent version of
 their code that I'm using. I hope this patch can be
 pulled even though there are still a few regressions, and that
 we can try to get them resolved during the RC period.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEIGSFVdO6eop9nER2z0QOqevODb4FAmCPCUsACgkQz0QOqevO
 Db77DQ/7B8V7RPlQ8C6HJlSuCED67W9isCG5CdzGobVafBrirbUusanQJRhjrIZO
 Voy0NYsR/rsM3K1tNk9AE7rlbT4UQibeUXwFVcVjBvtyXBiTgjbROc2AP4pjxAWu
 erH2McMEbrYjgrevwR/PKxyD8wS6vTX2InnI4yvlkbfEz04u/KkTSu0oN4UCU/8u
 8/drWDTIgZz6wffb1RpMFsCP77tfVWIWlRlH39u9OTe4fhPMug8jN+uOBrfyYxdp
 snJWznyeSYCQ4q/KkPkjfSUTDmx3+E1WeSHMNviHfwENdbcUAojk2O9wepBwJhQn
 r0DFU2yM+132oRkWO1DF7If1FRfvcmHjE4bmlLBSg+xgKOKpdMCs7Nf+s1Sji+w/
 8xTAPWzdqBeW6z4nIncvZPtjtes3979mJ/Jm/f4GLonAQB6yPJcIzA8gl5EEgXI3
 20pAt2JNCgCHVhHQso5fkLINlpND/cwlbOEOjyrNXIoJJngGDRo9FQ/osGBaLv5i
 n3XWC41lYnX9nqJ2FuVLBuZ+Jv1k5XSQualpyGGVTFaYp/jZVbjUOgJk7QPNsWl7
 9cUZAMVdDW6y7z1aZ2bu5y7VFIkPe4nfZNqrgXX+YySq0uOTrQBegkQRp1pu3t8m
 P3P9lVqcrn/kw+FASZborq921Njw+YDHvZuYfrnbF7J0sUL0fu4=
 =09Vm
 -----END PGP SIGNATURE-----

Merge tag 'for-linus-5.13-ofs-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux

Pull orangefs updates from Mike Marshall:
 "orangefs: implement orangefs_readahead

  mm/readahead.c/read_pages was quite a bit different back when I put my
  open-coded readahead logic into orangefs_readpage. That logic seemed
  to work as designed back then, but it is a trainwreck now.

  This implements orangefs_readahead using the new xarray and
  readahead_expand features and removes all my open-coded readahead
  logic.

  This results in an extreme read performance improvement, these sample
  numbers are from my test VM:

  Here's an example of what's upstream in
  5.11.8-200.fc33.x86_64:

     30+0 records in
     30+0 records out
     125829120 bytes (126 MB, 120 MiB) copied, 5.77943 s, 21.8 MB/s

  And here's this version of orangefs_readahead on top of 5.12.0-rc4:

     30+0 records in
     30+0 records out
     125829120 bytes (126 MB, 120 MiB) copied, 0.325919 s, 386 MB/s

  There are four xfstest regressions with this patch. David Howells and
  Matthew Wilcox have been helping me work with this code"

* tag 'for-linus-5.13-ofs-1' of git://git.kernel.org/pub/scm/linux/kernel/git/hubcap/linux:
  orangefs: leave files in the page cache for a few microseconds at least
  orangefs: implement orangefs_readahead.
This commit is contained in:
Linus Torvalds 2021-05-02 14:13:46 -07:00
Родитель 27787ba3fa 211f9f2e05
Коммит 9ccce092fc
3 изменённых файлов: 54 добавлений и 104 удалений

Просмотреть файл

@ -248,21 +248,7 @@ populate_shared_memory:
* or it can pointers to struct page's * or it can pointers to struct page's
*/ */
/* copy_amount = new_op->downcall.resp.io.amt_complete;
* When reading, readahead_size will only be zero when
* we're doing O_DIRECT, otherwise we got here from
* orangefs_readpage.
*
* If we got here from orangefs_readpage we want to
* copy either a page or the whole file into the io
* vector, whichever is smaller.
*/
if (readahead_size)
copy_amount =
min(new_op->downcall.resp.io.amt_complete,
(__s64)PAGE_SIZE);
else
copy_amount = new_op->downcall.resp.io.amt_complete;
ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index,
copy_amount); copy_amount);
@ -283,19 +269,11 @@ populate_shared_memory:
out: out:
if (buffer_index >= 0) { if (buffer_index >= 0) {
if ((readahead_size) && (type == ORANGEFS_IO_READ)) { orangefs_bufmap_put(buffer_index);
/* readpage */ gossip_debug(GOSSIP_FILE_DEBUG,
*index_return = buffer_index; "%s(%pU): PUT buffer_index %d\n",
gossip_debug(GOSSIP_FILE_DEBUG, __func__, handle, buffer_index);
"%s: hold on to buffer_index :%d:\n", buffer_index = -1;
__func__, buffer_index);
} else {
/* O_DIRECT */
orangefs_bufmap_put(buffer_index);
gossip_debug(GOSSIP_FILE_DEBUG,
"%s(%pU): PUT buffer_index %d\n",
__func__, handle, buffer_index);
}
} }
op_release(new_op); op_release(new_op);
return ret; return ret;

Просмотреть файл

@ -245,6 +245,50 @@ static int orangefs_writepages(struct address_space *mapping,
static int orangefs_launder_page(struct page *); static int orangefs_launder_page(struct page *);
/*
 * orangefs_readahead - ->readahead() address space operation.
 * @rac: readahead request describing the mapping and the page range.
 *
 * Optionally asks the core to widen the readahead window, then fills the
 * whole window with one wait_for_direct_io() read straight into the page
 * cache xarray, and finally completes every page in the request.
 */
static void orangefs_readahead(struct readahead_control *rac)
{
	loff_t offset;
	struct iov_iter iter;
	/*
	 * Reach the inode through rac->mapping rather than rac->file:
	 * rac->file is documented as possibly NULL when readahead is
	 * started internally, so it must not be dereferenced here.
	 */
	struct inode *inode = rac->mapping->host;
	struct xarray *i_pages;
	struct page *page;
	loff_t new_start = readahead_pos(rac);
	int ret;
	size_t new_len = 0;
	loff_t bytes_remaining = inode->i_size - readahead_pos(rac);
	loff_t pages_remaining = bytes_remaining / PAGE_SIZE;

	/*
	 * Pick an expanded window: 4194304 bytes when at least 1024 pages
	 * remain before i_size, otherwise everything up to i_size if that
	 * is more pages than the request currently holds.
	 */
	if (pages_remaining >= 1024)
		new_len = 4194304;
	else if (pages_remaining > readahead_count(rac))
		new_len = bytes_remaining;

	if (new_len)
		readahead_expand(rac, new_start, new_len);

	/* Re-read position/length: the expansion may have changed them. */
	offset = readahead_pos(rac);
	i_pages = &rac->mapping->i_pages;

	iov_iter_xarray(&iter, READ, i_pages, offset, readahead_length(rac));

	/*
	 * read in the pages.
	 * NOTE(review): rac->file is passed through unchanged; confirm
	 * wait_for_direct_io() tolerates a NULL file.
	 */
	ret = wait_for_direct_io(ORANGEFS_IO_READ, inode,
				 &offset, &iter, readahead_length(rac),
				 inode->i_size, NULL, NULL, rac->file);
	if (ret < 0)
		gossip_debug(GOSSIP_FILE_DEBUG,
			"%s: wait_for_direct_io failed. \n", __func__);
	else
		ret = 0;	/* a byte count is not an error for page_endio */

	/* clean up: finish each page as a read with status ret, drop our ref. */
	while ((page = readahead_page(rac))) {
		page_endio(page, false, ret);
		put_page(page);
	}
}
static int orangefs_readpage(struct file *file, struct page *page) static int orangefs_readpage(struct file *file, struct page *page)
{ {
struct inode *inode = page->mapping->host; struct inode *inode = page->mapping->host;
@ -252,44 +296,24 @@ static int orangefs_readpage(struct file *file, struct page *page)
struct bio_vec bv; struct bio_vec bv;
ssize_t ret; ssize_t ret;
loff_t off; /* offset into this page */ loff_t off; /* offset into this page */
pgoff_t index; /* which page */
struct page *next_page;
char *kaddr;
loff_t read_size;
int buffer_index = -1; /* orangefs shared memory slot */
int slot_index; /* index into slot */
int remaining;
/*
* Get up to this many bytes from Orangefs at a time and try
* to fill them into the page cache at once. Tests with dd made
* this seem like a reasonable static number, if there was
* interest perhaps this number could be made setable through
* sysfs...
*/
read_size = 524288;
if (PageDirty(page)) if (PageDirty(page))
orangefs_launder_page(page); orangefs_launder_page(page);
off = page_offset(page); off = page_offset(page);
index = off >> PAGE_SHIFT;
bv.bv_page = page; bv.bv_page = page;
bv.bv_len = PAGE_SIZE; bv.bv_len = PAGE_SIZE;
bv.bv_offset = 0; bv.bv_offset = 0;
iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE); iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
read_size, inode->i_size, NULL, &buffer_index, file); PAGE_SIZE, inode->i_size, NULL, NULL, file);
remaining = ret;
/* this will only zero remaining unread portions of the page data */ /* this will only zero remaining unread portions of the page data */
iov_iter_zero(~0U, &iter); iov_iter_zero(~0U, &iter);
/* takes care of potential aliasing */ /* takes care of potential aliasing */
flush_dcache_page(page); flush_dcache_page(page);
if (ret < 0) { if (ret < 0) {
SetPageError(page); SetPageError(page);
unlock_page(page);
goto out;
} else { } else {
SetPageUptodate(page); SetPageUptodate(page);
if (PageError(page)) if (PageError(page))
@ -298,60 +322,7 @@ static int orangefs_readpage(struct file *file, struct page *page)
} }
/* unlock the page after the ->readpage() routine completes */ /* unlock the page after the ->readpage() routine completes */
unlock_page(page); unlock_page(page);
return ret;
if (remaining > PAGE_SIZE) {
slot_index = 0;
while ((remaining - PAGE_SIZE) >= PAGE_SIZE) {
remaining -= PAGE_SIZE;
/*
* It is an optimization to try and fill more than one
* page... by now we've already gotten the single
* page we were after, if stuff doesn't seem to
* be going our way at this point just return
* and hope for the best.
*
* If we look for pages and they're already there is
* one reason to give up, and if they're not there
* and we can't create them is another reason.
*/
index++;
slot_index++;
next_page = find_get_page(inode->i_mapping, index);
if (next_page) {
gossip_debug(GOSSIP_FILE_DEBUG,
"%s: found next page, quitting\n",
__func__);
put_page(next_page);
goto out;
}
next_page = find_or_create_page(inode->i_mapping,
index,
GFP_KERNEL);
/*
* I've never hit this, leave it as a printk for
* now so it will be obvious.
*/
if (!next_page) {
printk("%s: can't create next page, quitting\n",
__func__);
goto out;
}
kaddr = kmap_atomic(next_page);
orangefs_bufmap_page_fill(kaddr,
buffer_index,
slot_index);
kunmap_atomic(kaddr);
SetPageUptodate(next_page);
unlock_page(next_page);
put_page(next_page);
}
}
out:
if (buffer_index != -1)
orangefs_bufmap_put(buffer_index);
return ret;
} }
static int orangefs_write_begin(struct file *file, static int orangefs_write_begin(struct file *file,
@ -660,6 +631,7 @@ out:
/** ORANGEFS2 implementation of address space operations */ /** ORANGEFS2 implementation of address space operations */
static const struct address_space_operations orangefs_address_operations = { static const struct address_space_operations orangefs_address_operations = {
.writepage = orangefs_writepage, .writepage = orangefs_writepage,
.readahead = orangefs_readahead,
.readpage = orangefs_readpage, .readpage = orangefs_readpage,
.writepages = orangefs_writepages, .writepages = orangefs_writepages,
.set_page_dirty = __set_page_dirty_nobuffers, .set_page_dirty = __set_page_dirty_nobuffers,

Просмотреть файл

@ -31,7 +31,7 @@ static ulong module_parm_debug_mask;
__u64 orangefs_gossip_debug_mask; __u64 orangefs_gossip_debug_mask;
int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS; int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS; int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
int orangefs_cache_timeout_msecs = 50; int orangefs_cache_timeout_msecs = 500;
int orangefs_dcache_timeout_msecs = 50; int orangefs_dcache_timeout_msecs = 50;
int orangefs_getattr_timeout_msecs = 50; int orangefs_getattr_timeout_msecs = 50;