From fc2e2e9c43e3b3f5dec8a02b17ee3d6343d9783a Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 12 Dec 2017 13:46:30 -0500 Subject: [PATCH 01/23] orangefs: implement xattr cache This uses the same timeout as the getattr cache. This substantially increases performance when writing files with smaller buffer sizes. When writing, the size is (often) changed, which causes a call to notify_change which calls security_inode_need_killpriv which needs a getxattr. Caching it reduces traffic to the server. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/inode.c | 1 + fs/orangefs/orangefs-kernel.h | 11 ++++ fs/orangefs/super.c | 10 ++++ fs/orangefs/xattr.c | 106 +++++++++++++++++++++++++++++++++- 4 files changed, 127 insertions(+), 1 deletion(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index c3334eca18c7..b47765ea6870 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -364,6 +364,7 @@ static int orangefs_set_inode(struct inode *inode, void *data) struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data; ORANGEFS_I(inode)->refn.fs_id = ref->fs_id; ORANGEFS_I(inode)->refn.khandle = ref->khandle; + hash_init(ORANGEFS_I(inode)->xattr_cache); return 0; } diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 17b24ad6b264..eba9136207f9 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -51,6 +51,7 @@ #include #include #include +#include #include @@ -193,6 +194,8 @@ struct orangefs_inode_s { unsigned long getattr_time; u32 getattr_mask; + + DECLARE_HASHTABLE(xattr_cache, 4); }; /* per superblock private orangefs info */ @@ -217,6 +220,14 @@ struct orangefs_stats { unsigned long writes; }; +struct orangefs_cached_xattr { + struct hlist_node node; + char key[ORANGEFS_MAX_XATTR_NAMELEN]; + char val[ORANGEFS_MAX_XATTR_VALUELEN]; + ssize_t length; + unsigned long timeout; +}; + extern struct orangefs_stats orangefs_stats; /* diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index dfaee90d30bd..31db6ac73de1 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -10,6 +10,7 @@ #include "orangefs-bufmap.h" #include +#include /* a cache for orangefs-inode objects (i.e. orangefs inode private data) */ static struct kmem_cache *orangefs_inode_cache; @@ -128,6 +129,15 @@ static void orangefs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_cached_xattr *cx; + struct hlist_node *tmp; + int i; + + hash_for_each_safe(orangefs_inode->xattr_cache, i, tmp, cx, node) { + hlist_del(&cx->node); + kfree(cx); + } + kmem_cache_free(orangefs_inode_cache, orangefs_inode); } diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 03bcb871544d..bdc285aea360 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago + * Copyright 2018 Omnibond Systems, L.L.C. * * See COPYING in top-level directory. */ @@ -14,7 +15,7 @@ #include "orangefs-bufmap.h" #include #include - +#include #define SYSTEM_ORANGEFS_KEY "system.pvfs2." #define SYSTEM_ORANGEFS_KEY_LEN 13 @@ -50,6 +51,35 @@ static inline int convert_to_internal_xattr_flags(int setxattr_flags) return internal_flag; } +static unsigned int xattr_key(const char *key) +{ + unsigned int i = 0; + while (key) + i += *key++; + return i % 16; +} + +static struct orangefs_cached_xattr *find_cached_xattr(struct inode *inode, + const char *key) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_cached_xattr *cx; + struct hlist_head *h; + struct hlist_node *tmp; + h = &orangefs_inode->xattr_cache[xattr_key(key)]; + if (hlist_empty(h)) + return NULL; + hlist_for_each_entry_safe(cx, tmp, h, node) { +/* if (!time_before(jiffies, cx->timeout)) { + hlist_del(&cx->node); + kfree(cx); + continue; + }*/ + if (!strcmp(cx->key, key)) + return cx; + } + return NULL; +} /* * Tries to get a specified key's attributes of a given @@ -65,6 +95,7 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op = NULL; + struct orangefs_cached_xattr *cx; ssize_t ret = -ENOMEM; ssize_t length = 0; int fsuid; @@ -93,6 +124,27 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, down_read(&orangefs_inode->xattr_sem); + cx = find_cached_xattr(inode, name); + if (cx && time_before(jiffies, cx->timeout)) { + if (cx->length == -1) { + ret = -ENODATA; + goto out_unlock; + } else { + if (size == 0) { + ret = cx->length; + goto out_unlock; + } + if (cx->length > size) { + ret = -ERANGE; + goto out_unlock; + } + memcpy(buffer, cx->val, cx->length); + memset(buffer + cx->length, 0, size - cx->length); + ret = cx->length; + goto out_unlock; + } + } + new_op = op_alloc(ORANGEFS_VFS_OP_GETXATTR); if (!new_op) goto out_unlock; @@ -117,6 +169,15 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, " does not exist!\n", get_khandle_from_ino(inode), (char *)new_op->upcall.req.getxattr.key); + cx = kmalloc(sizeof *cx, GFP_KERNEL); + if (cx) { + strcpy(cx->key, name); + cx->length = -1; + cx->timeout = jiffies + + orangefs_getattr_timeout_msecs*HZ/1000; + hash_add(orangefs_inode->xattr_cache, &cx->node, + xattr_key(cx->key)); + } } goto out_release_op; } @@ -156,6 +217,23 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, ret = length; + if (cx) { + strcpy(cx->key, name); + memcpy(cx->val, buffer, length); + cx->length = length; + cx->timeout = jiffies + HZ; + } else { + cx = kmalloc(sizeof *cx, GFP_KERNEL); + if (cx) { + strcpy(cx->key, name); + memcpy(cx->val, buffer, length); + cx->length = length; + cx->timeout = jiffies + HZ; + hash_add(orangefs_inode->xattr_cache, &cx->node, + xattr_key(cx->key)); + } + } + out_release_op: op_release(new_op); out_unlock: @@ -168,6 +246,9 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name, { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op = NULL; + struct orangefs_cached_xattr *cx; + struct hlist_head *h; + struct hlist_node *tmp; int ret = -ENOMEM; if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) @@ -209,6 +290,16 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name, "orangefs_inode_removexattr: returning %d\n", ret); op_release(new_op); + + h = &orangefs_inode->xattr_cache[xattr_key(name)]; + hlist_for_each_entry_safe(cx, tmp, h, node) { + if (!strcmp(cx->key, name)) { + hlist_del(&cx->node); + kfree(cx); + break; + } + } + out_unlock: up_write(&orangefs_inode->xattr_sem); return ret; @@ -226,6 +317,9 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name, struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; int internal_flag = 0; + struct orangefs_cached_xattr *cx; + struct hlist_head *h; + struct hlist_node *tmp; int ret = -ENOMEM; gossip_debug(GOSSIP_XATTR_DEBUG, @@ -287,6 +381,16 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name, /* when request is serviced properly, free req op struct */ op_release(new_op); + + h = &orangefs_inode->xattr_cache[xattr_key(name)]; + hlist_for_each_entry_safe(cx, tmp, h, node) { + if (!strcmp(cx->key, name)) { + hlist_del(&cx->node); + kfree(cx); + break; + } + } + out_unlock: up_write(&orangefs_inode->xattr_sem); return ret; From 66d5477d7002aeee206108b43cde12d12c3c9d5b Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Wed, 1 Aug 2018 18:12:34 +0000 Subject: [PATCH 02/23] orangefs: do not invalidate attributes on inode create When an inode is created, we fetch attributes from the server. There is no need to turn around and invalidate them. No need to initialize attributes after the getattr either. Either it'll be exactly the same, or it'll be something else and wrong. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/inode.c | 6 ------ fs/orangefs/namei.c | 6 ------ 2 files changed, 12 deletions(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index b47765ea6870..a18205dbd27e 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -460,12 +460,6 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, goto out_iput; orangefs_init_iops(inode); - - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - inode->i_size = PAGE_SIZE; inode->i_rdev = dev; error = insert_inode_locked4(inode, hash, orangefs_test_inode, ref); diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index c8676c996249..87584d79ca7a 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -76,8 +76,6 @@ static int orangefs_create(struct inode *dir, d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); - ORANGEFS_I(inode)->getattr_time = jiffies - 1; - ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "%s: dentry instantiated for %pd\n", @@ -291,8 +289,6 @@ static int orangefs_symlink(struct inode *dir, d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); - ORANGEFS_I(inode)->getattr_time = jiffies - 1; - ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Symlink) %pU -> %pd\n", @@ -360,8 +356,6 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode d_instantiate_new(dentry, inode); orangefs_set_timeout(dentry); - ORANGEFS_I(inode)->getattr_time = jiffies - 1; - ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Directory) %pU -> %pd\n", From 8b60785c1d7c63415c32bf64dabc686b9045ce7d Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Wed, 7 Feb 2018 18:44:50 +0000 Subject: [PATCH 03/23] orangefs: simplify orangefs_inode_getattr interface No need to store the received mask. It is either STATX_BASIC_STATS or STATX_BASIC_STATS & ~STATX_SIZE. If STATX_SIZE is requested, the cache is bypassed anyway, so the cached mask is unnecessary to decide whether to do a real getattr. This is a change. Previously a getattr would want size and use the cached size. All of the in-kernel callers that wanted size did not want a cached size. Now a getattr cannot use the cached size if it wants size at all. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 17 ++++++++--------- fs/orangefs/inode.c | 11 ++++++----- fs/orangefs/orangefs-kernel.h | 7 ++++--- fs/orangefs/orangefs-utils.c | 31 ++++++++++--------------------- 4 files changed, 28 insertions(+), 38 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index b094d3d79354..b0688ea894a4 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -420,8 +420,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite /* Make sure generic_write_checks sees an up to date inode size. */ if (file->f_flags & O_APPEND) { - rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, - STATX_SIZE); + rc = orangefs_inode_getattr(file->f_mapping->host, + ORANGEFS_GETATTR_SIZE); if (rc == -ESTALE) rc = -EIO; if (rc) { @@ -528,14 +528,13 @@ static vm_fault_t orangefs_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; int ret; - - ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, - STATX_SIZE); + ret = orangefs_inode_getattr(file->f_mapping->host, + ORANGEFS_GETATTR_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { - gossip_err("%s: orangefs_inode_getattr failed, ret:%d:.\n", - __func__, ret); + gossip_err("%s: orangefs_inode_getattr failed, " + "ret:%d:.\n", __func__, ret); return VM_FAULT_SIGBUS; } return filemap_fault(vmf); @@ -656,8 +655,8 @@ static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) * NOTE: We are only interested in file size here, * so we set mask accordingly. */ - ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, - STATX_SIZE); + ret = orangefs_inode_getattr(file->f_mapping->host, + ORANGEFS_GETATTR_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index a18205dbd27e..152c3683d881 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -162,7 +162,7 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) iattr->ia_size); /* Ensure that we have a up to date size, so we know if it changed. */ - ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE); + ret = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { @@ -256,7 +256,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, "orangefs_getattr: called on %pd\n", path->dentry); - ret = orangefs_inode_getattr(inode, 0, 0, request_mask); + ret = orangefs_inode_getattr(inode, + request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); if (ret == 0) { generic_fillattr(inode, stat); @@ -284,7 +285,7 @@ int orangefs_permission(struct inode *inode, int mask) gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__); /* Make sure the permission (and other common attrs) are up to date. */ - ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE); + ret = orangefs_inode_getattr(inode, 0); if (ret < 0) return ret; @@ -410,7 +411,7 @@ struct inode *orangefs_iget(struct super_block *sb, if (!(inode->i_state & I_NEW)) return inode; - error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL); + error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); if (error) { iget_failed(inode); return ERR_PTR(error); @@ -455,7 +456,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ - error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL); + error = orangefs_inode_getattr(inode, ORANGEFS_GETATTR_NEW); if (error) goto out_iput; diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index eba9136207f9..4f0cf14c18f6 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -193,7 +193,6 @@ struct orangefs_inode_s { sector_t last_failed_block_index_read; unsigned long getattr_time; - u32 getattr_mask; DECLARE_HASHTABLE(xattr_cache, 4); }; @@ -397,8 +396,10 @@ int orangefs_inode_setxattr(struct inode *inode, size_t size, int flags); -int orangefs_inode_getattr(struct inode *inode, int new, int bypass, - u32 request_mask); +#define ORANGEFS_GETATTR_NEW 1 +#define ORANGEFS_GETATTR_SIZE 2 + +int orangefs_inode_getattr(struct inode *, int); int orangefs_inode_check_changed(struct inode *inode); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 804c8a261e4b..76f18a3494c7 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago + * Copyright 2018 Omnibond Systems, L.L.C. * * See COPYING in top-level directory. */ @@ -272,8 +273,7 @@ static int orangefs_inode_is_stale(struct inode *inode, return 0; } -int orangefs_inode_getattr(struct inode *inode, int new, int bypass, - u32 request_mask) +int orangefs_inode_getattr(struct inode *inode, int flags) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -283,16 +283,9 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__, get_khandle_from_ino(inode)); - if (!new && !bypass) { - /* - * Must have all the attributes in the mask and be within cache - * time. - */ - if ((request_mask & orangefs_inode->getattr_mask) == - request_mask && - time_before(jiffies, orangefs_inode->getattr_time)) - return 0; - } + /* Must have all the attributes in the mask and be within cache time. */ + if (!flags && time_before(jiffies, orangefs_inode->getattr_time)) + return 0; new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR); if (!new_op) @@ -302,7 +295,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, * Size is the hardest attribute to get. The incremental cost of any * other attribute is essentially zero. */ - if (request_mask & STATX_SIZE || new) + if (flags) new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT; else new_op->upcall.req.getattr.mask = @@ -313,7 +306,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, if (ret != 0) goto out; - if (!new) { + if (!(flags & ORANGEFS_GETATTR_NEW)) { ret = orangefs_inode_is_stale(inode, &new_op->downcall.resp.getattr.attributes, new_op->downcall.resp.getattr.link_target); @@ -329,7 +322,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, case S_IFREG: inode->i_flags = orangefs_inode_flags(&new_op-> downcall.resp.getattr.attributes); - if (request_mask & STATX_SIZE || new) { + if (flags) { inode_size = (loff_t)new_op-> downcall.resp.getattr.attributes.size; inode->i_size = inode_size; @@ -343,7 +336,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, } break; case S_IFDIR: - if (request_mask & STATX_SIZE || new) { + if (flags) { inode->i_size = PAGE_SIZE; spin_lock(&inode->i_lock); inode_set_bytes(inode, inode->i_size); @@ -352,7 +345,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, set_nlink(inode, 1); break; case S_IFLNK: - if (new) { + if (flags & ORANGEFS_GETATTR_NEW) { inode->i_size = (loff_t)strlen(new_op-> downcall.resp.getattr.link_target); ret = strscpy(orangefs_inode->link_target, @@ -393,10 +386,6 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass, orangefs_inode->getattr_time = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; - if (request_mask & STATX_SIZE || new) - orangefs_inode->getattr_mask = STATX_BASIC_STATS; - else - orangefs_inode->getattr_mask = STATX_BASIC_STATS & ~STATX_SIZE; ret = 0; out: op_release(new_op); From 5e7f1d433804450cdb5ba478d26742963e06b1bc Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Mon, 12 Feb 2018 15:49:24 +0000 Subject: [PATCH 04/23] orangefs: update attributes rather than relying on server This should be a no-op now, but once inode writeback works, it'll be necessary to have the correct attribute in the dirty inode. Previously the attribute fetch timeout was marked invalid and the server provided the updated attribute. When the inode is dirty, the server cannot be consulted since it does not yet know the pending setattr. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 10 ++-------- fs/orangefs/namei.c | 7 ++++++- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index b0688ea894a4..a9e69c56d2fb 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -327,14 +327,8 @@ out: file_accessed(file); } else { file_update_time(file); - /* - * Must invalidate to ensure write loop doesn't - * prevent kernel from reading updated - * attribute. Size probably changed because of - * the write, and other clients could update - * any other attribute. - */ - orangefs_inode->getattr_time = jiffies - 1; + if (*offset > i_size_read(inode)) + i_size_write(inode, *offset); } } diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 87584d79ca7a..140314b76e10 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -383,6 +383,7 @@ static int orangefs_rename(struct inode *old_dir, unsigned int flags) { struct orangefs_kernel_op_s *new_op; + struct iattr iattr; int ret; if (flags) @@ -392,7 +393,11 @@ static int orangefs_rename(struct inode *old_dir, "orangefs_rename: called (%pd2 => %pd2) ct=%d\n", old_dentry, new_dentry, d_count(new_dentry)); - ORANGEFS_I(new_dentry->d_parent->d_inode)->getattr_time = jiffies - 1; + new_dir->i_mtime = new_dir->i_ctime = current_time(new_dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(new_dir, &iattr); + mark_inode_dirty_sync(new_dir); new_op = op_alloc(ORANGEFS_VFS_OP_RENAME); if (!new_op) From 5e4f606e26d6a1df6784f5833ea258047ac93254 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Mon, 12 Feb 2018 17:04:57 +0000 Subject: [PATCH 05/23] orangefs: hold i_lock during inode_getattr This should be a no-op now. When inode writeback works, this will prevent a getattr from overwriting inode data while an inode is transitioning to dirty. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/inode.c | 4 ++-- fs/orangefs/orangefs-utils.c | 33 +++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 152c3683d881..222ef7be0c7c 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -253,8 +253,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, struct inode *inode = path->dentry->d_inode; gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_getattr: called on %pd\n", - path->dentry); + "orangefs_getattr: called on %pd mask %u\n", + path->dentry, request_mask); ret = orangefs_inode_getattr(inode, request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 76f18a3494c7..d44cbe96719a 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -280,12 +280,17 @@ int orangefs_inode_getattr(struct inode *inode, int flags) loff_t inode_size; int ret, type; - gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__, - get_khandle_from_ino(inode)); + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU flags %d\n", + __func__, get_khandle_from_ino(inode), flags); + spin_lock(&inode->i_lock); /* Must have all the attributes in the mask and be within cache time. */ - if (!flags && time_before(jiffies, orangefs_inode->getattr_time)) + if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || + inode->i_state & I_DIRTY) { + spin_unlock(&inode->i_lock); return 0; + } + spin_unlock(&inode->i_lock); new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR); if (!new_op) @@ -306,13 +311,23 @@ int orangefs_inode_getattr(struct inode *inode, int flags) if (ret != 0) goto out; + spin_lock(&inode->i_lock); + /* Must have all the attributes in the mask and be within cache time. */ + if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || + inode->i_state & I_DIRTY) { + gossip_debug(GOSSIP_UTILS_DEBUG, "%s: in cache or dirty\n", + __func__); + ret = 0; + goto out_unlock; + } + if (!(flags & ORANGEFS_GETATTR_NEW)) { ret = orangefs_inode_is_stale(inode, &new_op->downcall.resp.getattr.attributes, new_op->downcall.resp.getattr.link_target); if (ret) { ret = -ESTALE; - goto out; + goto out_unlock; } } @@ -328,19 +343,15 @@ int orangefs_inode_getattr(struct inode *inode, int flags) inode->i_size = inode_size; inode->i_blkbits = ffs(new_op->downcall.resp.getattr. attributes.blksize); - spin_lock(&inode->i_lock); inode->i_bytes = inode_size; inode->i_blocks = (inode_size + 512 - inode_size % 512)/512; - spin_unlock(&inode->i_lock); } break; case S_IFDIR: if (flags) { inode->i_size = PAGE_SIZE; - spin_lock(&inode->i_lock); inode_set_bytes(inode, inode->i_size); - spin_unlock(&inode->i_lock); } set_nlink(inode, 1); break; @@ -353,7 +364,7 @@ int orangefs_inode_getattr(struct inode *inode, int flags) ORANGEFS_NAME_MAX); if (ret == -E2BIG) { ret = -EIO; - goto out; + goto out_unlock; } inode->i_link = orangefs_inode->link_target; } @@ -363,7 +374,7 @@ int orangefs_inode_getattr(struct inode *inode, int flags) /* XXX: ESTALE? This is what is done if it is not new. */ orangefs_make_bad_inode(inode); ret = -ESTALE; - goto out; + goto out_unlock; } inode->i_uid = make_kuid(&init_user_ns, new_op-> @@ -387,6 +398,8 @@ int orangefs_inode_getattr(struct inode *inode, int flags) orangefs_inode->getattr_time = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; ret = 0; +out_unlock: + spin_unlock(&inode->i_lock); out: op_release(new_op); return ret; From f2d34c738cbf21b0d24982693b71fd9f9f52ea4b Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Mon, 12 Feb 2018 20:28:42 +0000 Subject: [PATCH 06/23] orangefs: set up and use backing_dev_info Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/super.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 31db6ac73de1..ef3388c90ff7 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -407,15 +407,11 @@ static int orangefs_fill_sb(struct super_block *sb, struct orangefs_fs_mount_response *fs_mount, void *data, int silent) { - int ret = -EINVAL; - struct inode *root = NULL; - struct dentry *root_dentry = NULL; + int ret; + struct inode *root; + struct dentry *root_dentry; struct orangefs_object_kref root_object; - /* alloc and init our private orangefs sb info */ - sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); - if (!ORANGEFS_SB(sb)) - return -ENOMEM; ORANGEFS_SB(sb)->sb = sb; ORANGEFS_SB(sb)->root_khandle = fs_mount->root_khandle; @@ -438,6 +434,10 @@ static int orangefs_fill_sb(struct super_block *sb, sb->s_blocksize_bits = PAGE_SHIFT; sb->s_maxbytes = MAX_LFS_FILESIZE; + ret = super_setup_bdi(sb); + if (ret) + return ret; + root_object.khandle = ORANGEFS_SB(sb)->root_khandle; root_object.fs_id = ORANGEFS_SB(sb)->fs_id; gossip_debug(GOSSIP_SUPER_DEBUG, @@ -516,6 +516,13 @@ struct dentry *orangefs_mount(struct file_system_type *fst, goto free_op; } + /* alloc and init our private orangefs sb info */ + sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); + if (!ORANGEFS_SB(sb)) { + d = ERR_PTR(-ENOMEM); + goto free_op; + } + ret = orangefs_fill_sb(sb, &new_op->downcall.resp.fs_mount, data, flags & SB_SILENT ? 1 : 0); From df2d7337b570c34e051a2412f716743277ccf9c8 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Mon, 12 Feb 2018 20:29:37 +0000 Subject: [PATCH 07/23] orangefs: let setattr write to cached inode This is a fairly big change, but ultimately it's not a lot of code. Implement write_inode and then avoid the call to orangefs_inode_setattr within orangefs_setattr. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/inode.c | 11 +++-------- fs/orangefs/super.c | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 222ef7be0c7c..2e630c1f7ae2 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -207,8 +207,8 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) */ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) { - int ret = -EINVAL; struct inode *inode = dentry->d_inode; + int ret; gossip_debug(GOSSIP_INODE_DEBUG, "%s: called on %pd\n", @@ -228,16 +228,11 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) setattr_copy(inode, iattr); mark_inode_dirty(inode); - ret = orangefs_inode_setattr(inode, iattr); - gossip_debug(GOSSIP_INODE_DEBUG, - "%s: orangefs_inode_setattr returned %d\n", - __func__, - ret); - - if (!ret && (iattr->ia_valid & ATTR_MODE)) + if (iattr->ia_valid & ATTR_MODE) /* change mod on a file that has ACLs */ ret = posix_acl_chmod(inode, inode->i_mode); + ret = 0; out: gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret); return ret; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index ef3388c90ff7..f27da3bbafac 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -152,6 +152,22 @@ static void orangefs_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, orangefs_i_callback); } +static int orangefs_write_inode(struct inode *inode, + struct writeback_control *wbc) +{ + struct iattr iattr; + gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_write_inode\n"); + iattr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_ATIME | + ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME; + iattr.ia_mode = inode->i_mode; + iattr.ia_uid = inode->i_uid; + iattr.ia_gid = inode->i_gid; + iattr.ia_atime = inode->i_atime; + iattr.ia_mtime = inode->i_mtime; + iattr.ia_ctime = inode->i_ctime; + return orangefs_inode_setattr(inode, &iattr); +} + /* * NOTE: information filled in here is typically reflected in the * output of the system command 'df' @@ -310,6 +326,7 @@ void fsid_key_table_finalize(void) static const struct super_operations orangefs_s_ops = { .alloc_inode = orangefs_alloc_inode, .destroy_inode = orangefs_destroy_inode, + .write_inode = orangefs_write_inode, .drop_inode = generic_delete_inode, .statfs = orangefs_statfs, .remount_fs = orangefs_remount_fs, From afd9fb2a31797b4c787034294a4062df0c19c37e Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 13 Feb 2018 20:13:46 +0000 Subject: [PATCH 08/23] orangefs: reorganize setattr functions to track attribute changes OrangeFS accepts a mask indicating which attributes were changed. The kernel must not set any bits except those that were actually changed. The kernel must set the uid/gid of the request to the actual uid/gid responsible for the change. Code path for notify_change initiated setattrs is orangefs_setattr(dentry, iattr) -> __orangefs_setattr(inode, iattr) In kernel changes are initiated by calling __orangefs_setattr. Code path for writeback is orangefs_write_inode -> orangefs_inode_setattr attr_valid and attr_uid and attr_gid change together under i_lock. I_DIRTY changes separately. __orangefs_setattr lock if needs to be cleaned first, unlock and retry set attr_valid copy data in unlock mark_inode_dirty orangefs_inode_setattr lock copy attributes out unlock clear getattr_time # __writeback_single_inode clears dirty orangefs_inode_getattr # possible to get here with attr_valid set and not dirty lock if getattr_time ok or attr_valid set, unlock and return unlock do server operation # another thread may getattr or setattr, so check for that lock if getattr_time ok or attr_valid, unlock and return else, copy in update getattr_time unlock Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/acl.c | 4 +- fs/orangefs/inode.c | 76 ++++++++++++++++++----- fs/orangefs/namei.c | 35 +++++------ fs/orangefs/orangefs-kernel.h | 8 ++- fs/orangefs/orangefs-utils.c | 114 +++++++++++++--------------------- fs/orangefs/super.c | 11 +--- 6 files changed, 129 insertions(+), 119 deletions(-) diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 72d2ff17d27b..eced272a3c57 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -142,7 +142,7 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) rc = __orangefs_set_acl(inode, acl, type); } else { iattr.ia_valid = ATTR_MODE; - rc = orangefs_inode_setattr(inode, &iattr); + rc = __orangefs_setattr(inode, &iattr); } return rc; @@ -185,7 +185,7 @@ int orangefs_init_acl(struct inode *inode, struct inode *dir) inode->i_mode = mode; iattr.ia_mode = mode; iattr.ia_valid |= ATTR_MODE; - orangefs_inode_setattr(inode, &iattr); + __orangefs_setattr(inode, &iattr); } return error; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 2e630c1f7ae2..2708bf8af9cf 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago + * Copyright 2018 Omnibond Systems, L.L.C. * * See COPYING in top-level directory. */ @@ -202,22 +203,31 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) return ret; } -/* - * Change attributes of an object referenced by dentry. - */ -int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) +int __orangefs_setattr(struct inode *inode, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; int ret; - gossip_debug(GOSSIP_INODE_DEBUG, - "%s: called on %pd\n", - __func__, - dentry); - - ret = setattr_prepare(dentry, iattr); - if (ret) - goto out; + if (iattr->ia_valid & ATTR_MODE) { + if (iattr->ia_mode & (S_ISVTX)) { + if (is_root_handle(inode)) { + /* + * allow sticky bit to be set on root (since + * it shows up that way by default anyhow), + * but don't show it to the server + */ + iattr->ia_mode -= S_ISVTX; + } else { + gossip_debug(GOSSIP_UTILS_DEBUG, + "User attempted to set sticky bit on non-root directory; returning EINVAL.\n"); + return -EINVAL; + } + } + if (iattr->ia_mode & (S_ISUID)) { + gossip_debug(GOSSIP_UTILS_DEBUG, + "Attempting to set setuid bit (not supported); returning EINVAL.\n"); + return -EINVAL; + } + } if (iattr->ia_valid & ATTR_SIZE) { ret = orangefs_setattr_size(inode, iattr); @@ -225,7 +235,24 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) goto out; } +again: + spin_lock(&inode->i_lock); + if (ORANGEFS_I(inode)->attr_valid) { + if (uid_eq(ORANGEFS_I(inode)->attr_uid, current_fsuid()) && + gid_eq(ORANGEFS_I(inode)->attr_gid, current_fsgid())) { + ORANGEFS_I(inode)->attr_valid = iattr->ia_valid; + } else { + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + goto again; + } + } else { + ORANGEFS_I(inode)->attr_valid = iattr->ia_valid; + ORANGEFS_I(inode)->attr_uid = current_fsuid(); + ORANGEFS_I(inode)->attr_gid = current_fsgid(); + } setattr_copy(inode, iattr); + spin_unlock(&inode->i_lock); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) @@ -234,7 +261,25 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) ret = 0; out: - gossip_debug(GOSSIP_INODE_DEBUG, "%s: ret:%d:\n", __func__, ret); + return ret; +} + +/* + * Change attributes of an object referenced by dentry. + */ +int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) +{ + int ret; + gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n", + dentry); + ret = setattr_prepare(dentry, iattr); + if (ret) + goto out; + ret = __orangefs_setattr(d_inode(dentry), iattr); + sync_inode_metadata(d_inode(dentry), 1); +out: + gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_setattr: returning %d\n", + ret); return ret; } @@ -300,7 +345,7 @@ int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags iattr.ia_valid |= ATTR_CTIME; if (flags & S_MTIME) iattr.ia_valid |= ATTR_MTIME; - return orangefs_inode_setattr(inode, &iattr); + return __orangefs_setattr(inode, &iattr); } /* ORANGEFS2 implementation of VFS inode operations for files */ @@ -360,6 +405,7 @@ static int orangefs_set_inode(struct inode *inode, void *data) struct orangefs_object_kref *ref = (struct orangefs_object_kref *) data; ORANGEFS_I(inode)->refn.fs_id = ref->fs_id; ORANGEFS_I(inode)->refn.khandle = ref->khandle; + ORANGEFS_I(inode)->attr_valid = 0; hash_init(ORANGEFS_I(inode)->xattr_cache); return 0; } diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 140314b76e10..1dd710e5f376 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -82,11 +82,10 @@ static int orangefs_create(struct inode *dir, __func__, dentry); - dir->i_mtime = dir->i_ctime = current_time(dir); memset(&iattr, 0, sizeof iattr); - iattr.ia_valid |= ATTR_MTIME; - orangefs_inode_setattr(dir, &iattr); - mark_inode_dirty_sync(dir); + iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME; + iattr.ia_mtime = iattr.ia_ctime = current_time(dir); + __orangefs_setattr(dir, &iattr); ret = 0; out: op_release(new_op); @@ -208,11 +207,10 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) if (!ret) { drop_nlink(inode); - dir->i_mtime = dir->i_ctime = current_time(dir); memset(&iattr, 0, sizeof iattr); - iattr.ia_valid |= ATTR_MTIME; - orangefs_inode_setattr(dir, &iattr); - mark_inode_dirty_sync(dir); + iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME; + iattr.ia_mtime = iattr.ia_ctime = current_time(dir); + __orangefs_setattr(dir, &iattr); } return ret; } @@ -295,11 +293,10 @@ static int orangefs_symlink(struct inode *dir, get_khandle_from_ino(inode), dentry); - dir->i_mtime = dir->i_ctime = current_time(dir); memset(&iattr, 0, sizeof iattr); - iattr.ia_valid |= ATTR_MTIME; - orangefs_inode_setattr(dir, &iattr); - mark_inode_dirty_sync(dir); + iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME; + iattr.ia_mtime = iattr.ia_ctime = current_time(dir); + __orangefs_setattr(dir, &iattr); ret = 0; out: op_release(new_op); @@ -366,11 +363,10 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode * NOTE: we have no good way to keep nlink consistent for directories * across clients; keep constant at 1. */ - dir->i_mtime = dir->i_ctime = current_time(dir); memset(&iattr, 0, sizeof iattr); - iattr.ia_valid |= ATTR_MTIME; - orangefs_inode_setattr(dir, &iattr); - mark_inode_dirty_sync(dir); + iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME; + iattr.ia_mtime = iattr.ia_ctime = current_time(dir); + __orangefs_setattr(dir, &iattr); out: op_release(new_op); return ret; @@ -393,11 +389,10 @@ static int orangefs_rename(struct inode *old_dir, "orangefs_rename: called (%pd2 => %pd2) ct=%d\n", old_dentry, new_dentry, d_count(new_dentry)); - new_dir->i_mtime = new_dir->i_ctime = current_time(new_dir); memset(&iattr, 0, sizeof iattr); - iattr.ia_valid |= ATTR_MTIME; - orangefs_inode_setattr(new_dir, &iattr); - mark_inode_dirty_sync(new_dir); + iattr.ia_valid |= ATTR_MTIME | ATTR_CTIME; + iattr.ia_mtime = iattr.ia_ctime = current_time(new_dir); + __orangefs_setattr(new_dir, &iattr); new_op = op_alloc(ORANGEFS_VFS_OP_RENAME); if (!new_op) diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 4f0cf14c18f6..a74d9e8c5f9e 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -193,6 +193,9 @@ struct orangefs_inode_s { sector_t last_failed_block_index_read; unsigned long getattr_time; + int attr_valid; + kuid_t attr_uid; + kgid_t attr_gid; DECLARE_HASHTABLE(xattr_cache, 4); }; @@ -345,7 +348,8 @@ struct inode *orangefs_new_inode(struct super_block *sb, dev_t dev, struct orangefs_object_kref *ref); -int orangefs_setattr(struct dentry *dentry, struct iattr *iattr); +int __orangefs_setattr(struct inode *, struct iattr *); +int orangefs_setattr(struct dentry *, struct iattr *); int orangefs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); @@ -403,7 +407,7 @@ int orangefs_inode_getattr(struct inode *, int); int orangefs_inode_check_changed(struct inode *inode); -int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr); +int orangefs_inode_setattr(struct inode *inode); bool orangefs_cancel_op_in_progress(struct orangefs_kernel_op_s *op); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index d44cbe96719a..a4fac527f85d 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -136,51 +136,37 @@ static int orangefs_inode_perms(struct ORANGEFS_sys_attr_s *attrs) * NOTE: in kernel land, we never use the sys_attr->link_target for * anything, so don't bother copying it into the sys_attr object here. */ -static inline int copy_attributes_from_inode(struct inode *inode, - struct ORANGEFS_sys_attr_s *attrs, - struct iattr *iattr) +static inline void copy_attributes_from_inode(struct inode *inode, + struct ORANGEFS_sys_attr_s *attrs) { - umode_t tmp_mode; - - if (!iattr || !inode || !attrs) { - gossip_err("NULL iattr (%p), inode (%p), attrs (%p) " - "in copy_attributes_from_inode!\n", - iattr, - inode, - attrs); - return -EINVAL; - } - /* - * We need to be careful to only copy the attributes out of the - * iattr object that we know are valid. - */ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); attrs->mask = 0; - if (iattr->ia_valid & ATTR_UID) { - attrs->owner = from_kuid(&init_user_ns, iattr->ia_uid); + if (orangefs_inode->attr_valid & ATTR_UID) { + attrs->owner = from_kuid(&init_user_ns, inode->i_uid); attrs->mask |= ORANGEFS_ATTR_SYS_UID; gossip_debug(GOSSIP_UTILS_DEBUG, "(UID) %d\n", attrs->owner); } - if (iattr->ia_valid & ATTR_GID) { - attrs->group = from_kgid(&init_user_ns, iattr->ia_gid); + if (orangefs_inode->attr_valid & ATTR_GID) { + attrs->group = from_kgid(&init_user_ns, inode->i_gid); attrs->mask |= ORANGEFS_ATTR_SYS_GID; gossip_debug(GOSSIP_UTILS_DEBUG, "(GID) %d\n", attrs->group); } - if (iattr->ia_valid & ATTR_ATIME) { + if (orangefs_inode->attr_valid & ATTR_ATIME) { attrs->mask |= ORANGEFS_ATTR_SYS_ATIME; - if (iattr->ia_valid & ATTR_ATIME_SET) { - attrs->atime = (time64_t)iattr->ia_atime.tv_sec; + if (orangefs_inode->attr_valid & ATTR_ATIME_SET) { + attrs->atime = (time64_t)inode->i_atime.tv_sec; attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET; } } - if (iattr->ia_valid & ATTR_MTIME) { + if (orangefs_inode->attr_valid & ATTR_MTIME) { attrs->mask |= ORANGEFS_ATTR_SYS_MTIME; - if (iattr->ia_valid & ATTR_MTIME_SET) { - attrs->mtime = (time64_t)iattr->ia_mtime.tv_sec; + if (orangefs_inode->attr_valid & ATTR_MTIME_SET) { + attrs->mtime = (time64_t)inode->i_mtime.tv_sec; attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET; } } - if (iattr->ia_valid & ATTR_CTIME) + if (orangefs_inode->attr_valid & ATTR_CTIME) attrs->mask |= ORANGEFS_ATTR_SYS_CTIME; /* @@ -189,36 +175,10 @@ static inline int copy_attributes_from_inode(struct inode *inode, * worry about ATTR_SIZE */ - if (iattr->ia_valid & ATTR_MODE) { - tmp_mode = iattr->ia_mode; - if (tmp_mode & (S_ISVTX)) { - if (is_root_handle(inode)) { - /* - * allow sticky bit to be set on root (since - * it shows up that way by default anyhow), - * but don't show it to the server - */ - tmp_mode -= S_ISVTX; - } else { - gossip_debug(GOSSIP_UTILS_DEBUG, - "%s: setting sticky bit not supported.\n", - __func__); - return -EINVAL; - } - } - - if (tmp_mode & (S_ISUID)) { - gossip_debug(GOSSIP_UTILS_DEBUG, - "%s: setting setuid bit not supported.\n", - __func__); - return -EINVAL; - } - - attrs->perms = ORANGEFS_util_translate_mode(tmp_mode); + if (orangefs_inode->attr_valid & ATTR_MODE) { + attrs->perms = ORANGEFS_util_translate_mode(inode->i_mode); attrs->mask |= ORANGEFS_ATTR_SYS_PERM; } - - return 0; } static int orangefs_inode_type(enum orangefs_ds_type objtype) @@ -283,10 +243,16 @@ int orangefs_inode_getattr(struct inode *inode, int flags) gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU flags %d\n", __func__, get_khandle_from_ino(inode), flags); +again: spin_lock(&inode->i_lock); /* Must have all the attributes in the mask and be within cache time. */ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || - inode->i_state & I_DIRTY) { + orangefs_inode->attr_valid) { + if (orangefs_inode->attr_valid) { + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + goto again; + } spin_unlock(&inode->i_lock); return 0; } @@ -311,10 +277,16 @@ int orangefs_inode_getattr(struct inode *inode, int flags) if (ret != 0) goto out; +again2: spin_lock(&inode->i_lock); /* Must have all the attributes in the mask and be within cache time. */ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || - inode->i_state & I_DIRTY) { + orangefs_inode->attr_valid) { + if (orangefs_inode->attr_valid) { + spin_unlock(&inode->i_lock); + write_inode_now(inode, 1); + goto again2; + } gossip_debug(GOSSIP_UTILS_DEBUG, "%s: in cache or dirty\n", __func__); ret = 0; @@ -438,7 +410,7 @@ out: * issues a orangefs setattr request to make sure the new attribute values * take effect if successful. returns 0 on success; -errno otherwise */ -int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr) +int orangefs_inode_setattr(struct inode *inode) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -448,24 +420,26 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr) if (!new_op) return -ENOMEM; + spin_lock(&inode->i_lock); + new_op->upcall.uid = from_kuid(&init_user_ns, orangefs_inode->attr_uid); + new_op->upcall.gid = from_kgid(&init_user_ns, orangefs_inode->attr_gid); new_op->upcall.req.setattr.refn = orangefs_inode->refn; - ret = copy_attributes_from_inode(inode, - &new_op->upcall.req.setattr.attributes, - iattr); - if (ret >= 0) { - ret = service_operation(new_op, __func__, - get_interruptible_flag(inode)); + copy_attributes_from_inode(inode, + &new_op->upcall.req.setattr.attributes); + orangefs_inode->attr_valid = 0; + spin_unlock(&inode->i_lock); - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_inode_setattr: returning %d\n", - ret); - } + ret = service_operation(new_op, __func__, + get_interruptible_flag(inode)); + gossip_debug(GOSSIP_UTILS_DEBUG, + "orangefs_inode_setattr: returning %d\n", ret); + if (ret) + orangefs_make_bad_inode(inode); op_release(new_op); if (ret == 0) orangefs_inode->getattr_time = jiffies - 1; - return ret; } diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index f27da3bbafac..8fa30c13b7ed 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -155,17 +155,8 @@ static void orangefs_destroy_inode(struct inode *inode) static int orangefs_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct iattr iattr; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_write_inode\n"); - iattr.ia_valid = ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_ATIME | - ATTR_ATIME_SET | ATTR_MTIME | ATTR_MTIME_SET | ATTR_CTIME; - iattr.ia_mode = inode->i_mode; - iattr.ia_uid = inode->i_uid; - iattr.ia_gid = inode->i_gid; - iattr.ia_atime = inode->i_atime; - iattr.ia_mtime = inode->i_mtime; - iattr.ia_ctime = inode->i_ctime; - return orangefs_inode_setattr(inode, &iattr); + return orangefs_inode_setattr(inode); } /* From a68d9c606a67952fe547399bc45eebc8d83078bd Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Thu, 15 Feb 2018 18:02:43 +0000 Subject: [PATCH 09/23] orangefs: remove orangefs_readpages It's a copy of the loop which would run in read_pages from mm/readahead.c. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/inode.c | 39 +-------------------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 2708bf8af9cf..fd23a8ca641c 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -15,7 +15,7 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -static int read_one_page(struct page *page) +static int orangefs_readpage(struct file *file, struct page *page) { int ret; int max_block; @@ -60,42 +60,6 @@ static int read_one_page(struct page *page) return ret; } -static int orangefs_readpage(struct file *file, struct page *page) -{ - return read_one_page(page); -} - -static int orangefs_readpages(struct file *file, - struct address_space *mapping, - struct list_head *pages, - unsigned nr_pages) -{ - int page_idx; - int ret; - - gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_readpages called\n"); - - for (page_idx = 0; page_idx < nr_pages; page_idx++) { - struct page *page; - - page = lru_to_page(pages); - list_del(&page->lru); - if (!add_to_page_cache(page, - mapping, - page->index, - readahead_gfp_mask(mapping))) { - ret = read_one_page(page); - gossip_debug(GOSSIP_INODE_DEBUG, - "failure adding page to cache, read_one_page returned: %d\n", - ret); - } else { - put_page(page); - } - } - BUG_ON(!list_empty(pages)); - return 0; -} - static void orangefs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) @@ -141,7 +105,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, /** ORANGEFS2 implementation of address space operations */ static const struct address_space_operations orangefs_address_operations = { .readpage = orangefs_readpage, - .readpages = orangefs_readpages, .invalidatepage = orangefs_invalidatepage, .releasepage = orangefs_releasepage, .direct_IO = orangefs_direct_IO, From 0dcac0f7812b2c09ed018a5eba91448a37f1b71b Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Thu, 15 Feb 2018 19:38:01 +0000 Subject: [PATCH 10/23] orangefs: service ops done for writeback are not killable Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-kernel.h | 1 + fs/orangefs/orangefs-utils.c | 2 +- fs/orangefs/waitqueue.c | 18 ++++++++++-------- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index a74d9e8c5f9e..46b9ad1d2a9b 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -442,6 +442,7 @@ extern const struct dentry_operations orangefs_dentry_operations; #define ORANGEFS_OP_CANCELLATION 4 /* this is a cancellation */ #define ORANGEFS_OP_NO_MUTEX 8 /* don't acquire request_mutex */ #define ORANGEFS_OP_ASYNC 16 /* Queue it, but don't wait */ +#define ORANGEFS_OP_WRITEBACK 32 int service_operation(struct orangefs_kernel_op_s *op, const char *op_name, diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index a4fac527f85d..9221c4a3398e 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -430,7 +430,7 @@ int orangefs_inode_setattr(struct inode *inode) spin_unlock(&inode->i_lock); ret = service_operation(new_op, __func__, - get_interruptible_flag(inode)); + get_interruptible_flag(inode) | ORANGEFS_OP_WRITEBACK); gossip_debug(GOSSIP_UTILS_DEBUG, "orangefs_inode_setattr: returning %d\n", ret); if (ret) diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index 0729d2645d6a..beafc33d57be 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -19,7 +19,7 @@ static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op, long timeout, - bool interruptible) + int flags) __acquires(op->lock); static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s *op) __releases(op->lock); @@ -143,9 +143,7 @@ retry_servicing: if (!(flags & ORANGEFS_OP_NO_MUTEX)) mutex_unlock(&orangefs_request_mutex); - ret = wait_for_matching_downcall(op, timeout, - flags & ORANGEFS_OP_INTERRUPTIBLE); - + ret = wait_for_matching_downcall(op, timeout, flags); gossip_debug(GOSSIP_WAIT_DEBUG, "%s: wait_for_matching_downcall returned %d for %p\n", __func__, @@ -319,10 +317,12 @@ static void */ static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op, long timeout, - bool interruptible) + int flags) __acquires(op->lock) { long n; + int writeback = flags & ORANGEFS_OP_WRITEBACK, + interruptible = flags & ORANGEFS_OP_INTERRUPTIBLE; /* * There's a "schedule_timeout" inside of these wait @@ -330,10 +330,12 @@ static int wait_for_matching_downcall(struct orangefs_kernel_op_s *op, * user process that needs something done and is being * manipulated by the client-core process. */ - if (interruptible) + if (writeback) + n = wait_for_completion_io_timeout(&op->waitq, timeout); + else if (!writeback && interruptible) n = wait_for_completion_interruptible_timeout(&op->waitq, - timeout); - else + timeout); + else /* !writeback && !interruptible but compiler complains */ n = wait_for_completion_killable_timeout(&op->waitq, timeout); spin_lock(&op->lock); From c453dcfc79815760071bd9a7805d4b809fec05cf Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Fri, 16 Feb 2018 20:51:24 +0000 Subject: [PATCH 11/23] orangefs: migrate to generic_file_read_iter Remove orangefs_inode_read. It was used by readpage. Calling wait_for_direct_io directly serves the purpose just as well. There is now no check of the bufmap size in the readpage path. There are already other places the bufmap size is assumed to be greater than PAGE_SIZE. Important to call truncate_inode_pages now in the write path so a subsequent read sees the new data. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 66 ++++------------------------------- fs/orangefs/inode.c | 59 +++++++++++-------------------- fs/orangefs/orangefs-kernel.h | 13 ++++--- 3 files changed, 36 insertions(+), 102 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index a9e69c56d2fb..934f102ce9e1 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -44,7 +44,7 @@ static int flush_racache(struct inode *inode) /* * Post and wait for the I/O upcall to finish */ -static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, +ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, loff_t *offset, struct iov_iter *iter, size_t total_size, loff_t readahead_size) { @@ -240,7 +240,7 @@ out: * augmented/extended metadata attached to the file. * Note: File extended attributes override any mount options. */ -static ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file, +ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file, loff_t *offset, struct iov_iter *iter) { struct inode *inode = file->f_mapping->host; @@ -341,65 +341,11 @@ out: return ret; } -/* - * Read data from a specified offset in a file (referenced by inode). - * Data may be placed either in a user or kernel buffer. - */ -ssize_t orangefs_inode_read(struct inode *inode, - struct iov_iter *iter, - loff_t *offset, - loff_t readahead_size) +static ssize_t orangefs_file_read_iter(struct kiocb *iocb, + struct iov_iter *iter) { - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - size_t count = iov_iter_count(iter); - size_t bufmap_size; - ssize_t ret = -EINVAL; - orangefs_stats.reads++; - - bufmap_size = orangefs_bufmap_size_query(); - if (count > bufmap_size) { - gossip_debug(GOSSIP_FILE_DEBUG, - "%s: count is too large (%zd/%zd)!\n", - __func__, count, bufmap_size); - return -EINVAL; - } - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU) %zd@%llu\n", - __func__, - &orangefs_inode->refn.khandle, - count, - llu(*offset)); - - ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, offset, iter, - count, readahead_size); - if (ret > 0) - *offset += ret; - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): Value(%zd) returned.\n", - __func__, - &orangefs_inode->refn.khandle, - ret); - - return ret; -} - -static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - loff_t pos = iocb->ki_pos; - ssize_t rc = 0; - - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_read_iter\n"); - - orangefs_stats.reads++; - - rc = do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); - iocb->ki_pos = pos; - - return rc; + return generic_file_read_iter(iocb, iter); } static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) @@ -408,6 +354,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite loff_t pos; ssize_t rc; + truncate_inode_pages(file->f_mapping, 0); + gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n"); inode_lock(file->f_mapping->host); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index fd23a8ca641c..31ee3cb67fe0 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -17,37 +17,25 @@ static int orangefs_readpage(struct file *file, struct page *page) { - int ret; - int max_block; - ssize_t bytes_read = 0; struct inode *inode = page->mapping->host; - const __u32 blocksize = PAGE_SIZE; - const __u32 blockbits = PAGE_SHIFT; - struct iov_iter to; - struct bio_vec bv = {.bv_page = page, .bv_len = PAGE_SIZE}; + struct iov_iter iter; + struct bio_vec bv; + ssize_t ret; + loff_t off; - iov_iter_bvec(&to, READ, &bv, 1, PAGE_SIZE); + off = page_offset(page); + bv.bv_page = page; + bv.bv_len = PAGE_SIZE; + bv.bv_offset = 0; + iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE); - gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_readpage called with page %p\n", - page); - - max_block = ((inode->i_size / blocksize) + 1); - - if (page->index < max_block) { - loff_t blockptr_offset = (((loff_t) page->index) << blockbits); - - bytes_read = orangefs_inode_read(inode, - &to, - &blockptr_offset, - inode->i_size); - } + ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, + PAGE_SIZE, inode->i_size); /* this will only zero remaining unread portions of the page data */ - iov_iter_zero(~0U, &to); + iov_iter_zero(~0U, &iter); /* takes care of potential aliasing */ flush_dcache_page(page); - if (bytes_read < 0) { - ret = bytes_read; + if (ret < 0) { SetPageError(page); } else { SetPageUptodate(page); @@ -84,22 +72,17 @@ static int orangefs_releasepage(struct page *page, gfp_t foo) return 0; } -/* - * Having a direct_IO entry point in the address_space_operations - * struct causes the kernel to allows us to use O_DIRECT on - * open. Nothing will ever call this thing, but in the future we - * will need to be able to use O_DIRECT on open in order to support - * AIO. Modeled after NFS, they do this too. - */ - static ssize_t orangefs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { - gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_direct_IO: %pD\n", - iocb->ki_filp); - - return -EINVAL; + struct file *file = iocb->ki_filp; + loff_t pos = *(&iocb->ki_pos); + /* + * This cannot happen until write_iter becomes + * generic_file_write_iter. + */ + BUG_ON(iov_iter_rw(iter) != READ); + return do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); } /** ORANGEFS2 implementation of address space operations */ diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 46b9ad1d2a9b..307bbb61819a 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -369,11 +369,6 @@ ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size); struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref *ref); -ssize_t orangefs_inode_read(struct inode *inode, - struct iov_iter *iter, - loff_t *offset, - loff_t readahead_size); - /* * defined in devorangefs-req.c */ @@ -384,6 +379,14 @@ void orangefs_dev_cleanup(void); int is_daemon_in_service(void); bool __is_daemon_in_service(void); +/* + * defined in file.c + */ +ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *, + struct iov_iter *, size_t, loff_t); +ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *, + struct iov_iter *); + /* * defined in orangefs-utils.c */ From 85ac799cf926a589829ebe6274bb5e5a41159743 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Thu, 22 Feb 2018 18:10:43 +0000 Subject: [PATCH 12/23] orangefs: implement writepage Now orangefs_inode_getattr fills from cache if an inode has dirty pages. also if attr_valid and dirty pages and !flags, we spin on inode writeback before returning if pages still dirty after: should it be other way Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 75 ++++++++---------------------------- fs/orangefs/inode.c | 65 ++++++++++++++++++++++++++++--- fs/orangefs/orangefs-utils.c | 8 +++- 3 files changed, 80 insertions(+), 68 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 934f102ce9e1..d8c97b87bf26 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago + * Copyright 2018 Omnibond Systems, L.L.C. * * See COPYING in top-level directory. */ @@ -348,63 +349,11 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, return generic_file_read_iter(iocb, iter); } -static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) +static ssize_t orangefs_file_write_iter(struct kiocb *iocb, + struct iov_iter *iter) { - struct file *file = iocb->ki_filp; - loff_t pos; - ssize_t rc; - - truncate_inode_pages(file->f_mapping, 0); - - gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n"); - - inode_lock(file->f_mapping->host); - - /* Make sure generic_write_checks sees an up to date inode size. */ - if (file->f_flags & O_APPEND) { - rc = orangefs_inode_getattr(file->f_mapping->host, - ORANGEFS_GETATTR_SIZE); - if (rc == -ESTALE) - rc = -EIO; - if (rc) { - gossip_err("%s: orangefs_inode_getattr failed, " - "rc:%zd:.\n", __func__, rc); - goto out; - } - } - - rc = generic_write_checks(iocb, iter); - - if (rc <= 0) { - gossip_err("%s: generic_write_checks failed, rc:%zd:.\n", - __func__, rc); - goto out; - } - - /* - * if we are appending, generic_write_checks would have updated - * pos to the end of the file, so we will wait till now to set - * pos... - */ - pos = iocb->ki_pos; - - rc = do_readv_writev(ORANGEFS_IO_WRITE, - file, - &pos, - iter); - if (rc < 0) { - gossip_err("%s: do_readv_writev failed, rc:%zd:.\n", - __func__, rc); - goto out; - } - - iocb->ki_pos = pos; orangefs_stats.writes++; - -out: - - inode_unlock(file->f_mapping->host); - return rc; + return generic_file_write_iter(iocb, iter); } /* @@ -499,9 +448,6 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) (char *)file->f_path.dentry->d_name.name : (char *)"Unknown")); - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) - return -EINVAL; - /* set the sequential readahead hint */ vma->vm_flags |= VM_SEQ_READ; vma->vm_flags &= ~VM_RAND_READ; @@ -541,8 +487,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file) gossip_debug(GOSSIP_INODE_DEBUG, "flush_racache finished\n"); } - truncate_inode_pages(file_inode(file)->i_mapping, - 0); } return 0; } @@ -560,6 +504,11 @@ static int orangefs_fsync(struct file *file, ORANGEFS_I(file_inode(file)); struct orangefs_kernel_op_s *new_op = NULL; + ret = filemap_write_and_wait_range(file_inode(file)->i_mapping, + start, end); + if (ret < 0) + return ret; + new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC); if (!new_op) return -ENOMEM; @@ -641,6 +590,11 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) return rc; } +static int orangefs_flush(struct file *file, fl_owner_t id) +{ + return vfs_fsync(file, 0); +} + /** ORANGEFS implementation of VFS file operations */ const struct file_operations orangefs_file_operations = { .llseek = orangefs_file_llseek, @@ -650,6 +604,7 @@ const struct file_operations orangefs_file_operations = { .unlocked_ioctl = orangefs_ioctl, .mmap = orangefs_file_mmap, .open = generic_file_open, + .flush = orangefs_flush, .release = orangefs_file_release, .fsync = orangefs_fsync, }; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 31ee3cb67fe0..3b54974817ea 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -15,6 +15,50 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" +static int orangefs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct iov_iter iter; + struct bio_vec bv; + size_t len, wlen; + ssize_t ret; + loff_t off; + + set_page_writeback(page); + + off = page_offset(page); + len = i_size_read(inode); + if (off > len) { + /* The file was truncated; there is nothing to write. */ + unlock_page(page); + end_page_writeback(page); + return 0; + } + if (off + PAGE_SIZE > len) + wlen = len - off; + else + wlen = PAGE_SIZE; + + bv.bv_page = page; + bv.bv_len = wlen; + bv.bv_offset = off % PAGE_SIZE; + if (wlen == 0) + dump_stack(); + iov_iter_bvec(&iter, WRITE, &bv, 1, wlen); + + ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, + len); + if (ret < 0) { + SetPageError(page); + mapping_set_error(page->mapping, ret); + } else { + ret = 0; + } + unlock_page(page); + end_page_writeback(page); + return ret; +} + static int orangefs_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; @@ -48,6 +92,15 @@ static int orangefs_readpage(struct file *file, struct page *page) return ret; } +static int orangefs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) +{ + int r; + r = simple_write_end(file, mapping, pos, len, copied, page, fsdata); + mark_inode_dirty_sync(file_inode(file)); + return r; +} + static void orangefs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) @@ -77,17 +130,17 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, { struct file *file = iocb->ki_filp; loff_t pos = *(&iocb->ki_pos); - /* - * This cannot happen until write_iter becomes - * generic_file_write_iter. - */ - BUG_ON(iov_iter_rw(iter) != READ); - return do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter); + return do_readv_writev(iov_iter_rw(iter) == WRITE ? + ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter); } /** ORANGEFS2 implementation of address space operations */ static const struct address_space_operations orangefs_address_operations = { + .writepage = orangefs_writepage, .readpage = orangefs_readpage, + .set_page_dirty = __set_page_dirty_nobuffers, + .write_begin = simple_write_begin, + .write_end = orangefs_write_end, .invalidatepage = orangefs_invalidatepage, .releasepage = orangefs_releasepage, .direct_IO = orangefs_direct_IO, diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 9221c4a3398e..d6093a468db9 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -247,7 +247,7 @@ again: spin_lock(&inode->i_lock); /* Must have all the attributes in the mask and be within cache time. */ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || - orangefs_inode->attr_valid) { + orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) { if (orangefs_inode->attr_valid) { spin_unlock(&inode->i_lock); write_inode_now(inode, 1); @@ -281,12 +281,16 @@ again2: spin_lock(&inode->i_lock); /* Must have all the attributes in the mask and be within cache time. */ if ((!flags && time_before(jiffies, orangefs_inode->getattr_time)) || - orangefs_inode->attr_valid) { + orangefs_inode->attr_valid || inode->i_state & I_DIRTY_PAGES) { if (orangefs_inode->attr_valid) { spin_unlock(&inode->i_lock); write_inode_now(inode, 1); goto again2; } + if (inode->i_state & I_DIRTY_PAGES) { + ret = 0; + goto out_unlock; + } gossip_debug(GOSSIP_UTILS_DEBUG, "%s: in cache or dirty\n", __func__); ret = 0; From 43f34576042eb3256c39b502b22c6755144f7517 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 6 Nov 2018 19:51:39 +0000 Subject: [PATCH 13/23] orangefs: do not return successful read when the client-core disappeared Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index d8c97b87bf26..0af9f0b42d80 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -169,7 +169,10 @@ populate_shared_memory: * trigger the write. */ case OP_VFS_STATE_INPROGR: - ret = total_size; + if (type == ORANGEFS_IO_READ) + ret = -EINTR; + else + ret = total_size; break; default: gossip_err("%s: unexpected op state :%d:.\n", From 3e9dfc6e1e8bce62a329f1452c7eeccbac230980 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 6 Nov 2018 19:54:49 +0000 Subject: [PATCH 14/23] orangefs: move do_readv_writev to direct_IO direct_IO was the only caller and all direct_IO did was call it, so there's no use in having the code spread out into so many functions. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 108 ----------------------------------------- fs/orangefs/inode.c | 114 ++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 111 insertions(+), 111 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 0af9f0b42d80..f4e20d5ed207 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -237,114 +237,6 @@ out: return ret; } -/* - * Common entry point for read/write/readv/writev - * This function will dispatch it to either the direct I/O - * or buffered I/O path depending on the mount options and/or - * augmented/extended metadata attached to the file. - * Note: File extended attributes override any mount options. - */ -ssize_t do_readv_writev(enum ORANGEFS_io_type type, struct file *file, - loff_t *offset, struct iov_iter *iter) -{ - struct inode *inode = file->f_mapping->host; - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; - size_t count = iov_iter_count(iter); - ssize_t total_count = 0; - ssize_t ret = -EINVAL; - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", - __func__, - handle, - (int)count); - - if (type == ORANGEFS_IO_WRITE) { - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): proceeding with offset : %llu, " - "size %d\n", - __func__, - handle, - llu(*offset), - (int)count); - } - - if (count == 0) { - ret = 0; - goto out; - } - - while (iov_iter_count(iter)) { - size_t each_count = iov_iter_count(iter); - size_t amt_complete; - - /* how much to transfer in this loop iteration */ - if (each_count > orangefs_bufmap_size_query()) - each_count = orangefs_bufmap_size_query(); - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): size of each_count(%d)\n", - __func__, - handle, - (int)each_count); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): BEFORE wait_for_io: offset is %d\n", - __func__, - handle, - (int)*offset); - - ret = wait_for_direct_io(type, inode, offset, iter, - each_count, 0); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): return from wait_for_io:%d\n", - __func__, - handle, - (int)ret); - - if (ret < 0) - goto out; - - *offset += ret; - total_count += ret; - amt_complete = ret; - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): AFTER wait_for_io: offset is %d\n", - __func__, - handle, - (int)*offset); - - /* - * if we got a short I/O operations, - * fall out and return what we got so far - */ - if (amt_complete < each_count) - break; - } /*end while */ - -out: - if (total_count > 0) - ret = total_count; - if (ret > 0) { - if (type == ORANGEFS_IO_READ) { - file_accessed(file); - } else { - file_update_time(file); - if (*offset > i_size_read(inode)) - i_size_write(inode, *offset); - } - } - - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): Value(%d) returned.\n", - __func__, - handle, - (int)ret); - - return ret; -} - static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 3b54974817ea..1c72aa38317d 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -128,10 +128,118 @@ static int orangefs_releasepage(struct page *page, gfp_t foo) static ssize_t orangefs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { + /* + * Comment from original do_readv_writev: + * Common entry point for read/write/readv/writev + * This function will dispatch it to either the direct I/O + * or buffered I/O path depending on the mount options and/or + * augmented/extended metadata attached to the file. + * Note: File extended attributes override any mount options. + */ struct file *file = iocb->ki_filp; - loff_t pos = *(&iocb->ki_pos); - return do_readv_writev(iov_iter_rw(iter) == WRITE ? - ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter); + loff_t pos = iocb->ki_pos; + enum ORANGEFS_io_type type = iov_iter_rw(iter) == WRITE ? + ORANGEFS_IO_WRITE : ORANGEFS_IO_READ; + loff_t *offset = &pos; + struct inode *inode = file->f_mapping->host; + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; + size_t count = iov_iter_count(iter); + size_t ORIGINALcount = iov_iter_count(iter); + ssize_t total_count = 0; + ssize_t ret = -EINVAL; + int i = 0; + + gossip_debug(GOSSIP_FILE_DEBUG, + "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n", + __func__, + handle, + (int)count); + + if (type == ORANGEFS_IO_WRITE) { + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): proceeding with offset : %llu, " + "size %d\n", + __func__, + handle, + llu(*offset), + (int)count); + } + + if (count == 0) { + ret = 0; + goto out; + } + + while (iov_iter_count(iter)) { + size_t each_count = iov_iter_count(iter); + size_t amt_complete; + i++; + + /* how much to transfer in this loop iteration */ + if (each_count > orangefs_bufmap_size_query()) + each_count = orangefs_bufmap_size_query(); + + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): size of each_count(%d)\n", + __func__, + handle, + (int)each_count); + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): BEFORE wait_for_io: offset is %d\n", + __func__, + handle, + (int)*offset); + + ret = wait_for_direct_io(type, inode, offset, iter, + each_count, 0); + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): return from wait_for_io:%d\n", + __func__, + handle, + (int)ret); + + if (ret < 0) + goto out; + + *offset += ret; + total_count += ret; + amt_complete = ret; + + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): AFTER wait_for_io: offset is %d\n", + __func__, + handle, + (int)*offset); + + /* + * if we got a short I/O operations, + * fall out and return what we got so far + */ + if (amt_complete < each_count) + break; + } /*end while */ + +out: + if (total_count > 0) + ret = total_count; + if (ret > 0) { + if (type == ORANGEFS_IO_READ) { + file_accessed(file); + } else { + file_update_time(file); + if (*offset > i_size_read(inode)) + i_size_write(inode, *offset); + } + } + + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): Value(%d) returned.\n", + __func__, + handle, + (int)ret); + + return ret; } /** ORANGEFS2 implementation of address space operations */ From 8a88bbce6f83430a737a97bb72d0912a6a103945 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Thu, 22 Feb 2018 18:15:56 +0000 Subject: [PATCH 15/23] orangefs: skip inode writeout if nothing to write Would happen if an inode is dirty but whatever happened is not something that can be written out to OrangeFS. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/orangefs-utils.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index d6093a468db9..d4b7ae763186 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -431,6 +431,11 @@ int orangefs_inode_setattr(struct inode *inode) copy_attributes_from_inode(inode, &new_op->upcall.req.setattr.attributes); orangefs_inode->attr_valid = 0; + if (!new_op->upcall.req.setattr.attributes.mask) { + spin_unlock(&inode->i_lock); + op_release(new_op); + return 0; + } spin_unlock(&inode->i_lock); ret = service_operation(new_op, __func__, From 90fc07065a3505e5a874c5854fd6176beb545e08 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Mon, 26 Mar 2018 18:58:11 +0000 Subject: [PATCH 16/23] orangefs: avoid fsync service operation on flush Without this, an fsync call is sent to the server even if no data changed. This resulted in a rather severe (50%) performance regression under certain metadata-heavy workloads. In the past, everything was direct IO. Nothing happend on a close call. An explicit fsync call would send an fsync request to the server which in turn fsynced the underlying file. Now there are cached writes. Then fsync began writing out dirty pages in addition to making an fsync request to the server, and close began calling fsync. With this commit, close only writes out dirty pages, and does not make the fsync request. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index f4e20d5ed207..26d8ff410b0a 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -487,7 +487,29 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) static int orangefs_flush(struct file *file, fl_owner_t id) { - return vfs_fsync(file, 0); + /* + * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the + * service_operation in orangefs_fsync. + * + * Do not send fsync to OrangeFS server on a close. Do send fsync + * on an explicit fsync call. This duplicates historical OrangeFS + * behavior. + */ + struct inode *inode = file->f_mapping->host; + int r; + + if (inode->i_state & I_DIRTY_TIME) { + spin_lock(&inode->i_lock); + inode->i_state &= ~I_DIRTY_TIME; + spin_unlock(&inode->i_lock); + mark_inode_dirty_sync(inode); + } + + r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX); + if (r > 0) + return 0; + else + return r; } /** ORANGEFS implementation of VFS file operations */ From 52e2d0a3804c095775b178d6b0707ef6ac8e6d04 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Fri, 14 Dec 2018 15:24:43 -0500 Subject: [PATCH 17/23] orangefs: write range tracking Attach the actual range of bytes written to plus the responsible uid/gid to each dirty page. This information must be sent to the server when the page is written out. Now write_begin, page_mkwrite, and invalidatepage keep up with this information. There are several conditions where they must write out the page immediately to store the new range. Two non-contiguous ranges cannot be stored on a single page. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 10 +- fs/orangefs/inode.c | 289 ++++++++++++++++++++++++++++++---- fs/orangefs/orangefs-kernel.h | 10 +- 3 files changed, 274 insertions(+), 35 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 26d8ff410b0a..f409ac5d3410 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -46,8 +46,8 @@ static int flush_racache(struct inode *inode) * Post and wait for the I/O upcall to finish */ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, - loff_t *offset, struct iov_iter *iter, - size_t total_size, loff_t readahead_size) + loff_t *offset, struct iov_iter *iter, size_t total_size, + loff_t readahead_size, struct orangefs_write_range *wr) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; @@ -85,6 +85,10 @@ populate_shared_memory: new_op->upcall.req.io.buf_index = buffer_index; new_op->upcall.req.io.count = total_size; new_op->upcall.req.io.offset = *offset; + if (type == ORANGEFS_IO_WRITE && wr) { + new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid); + new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid); + } gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): offset: %llu total_size: %zd\n", @@ -329,7 +333,7 @@ static vm_fault_t orangefs_fault(struct vm_fault *vmf) static const struct vm_operations_struct orangefs_file_vm_ops = { .fault = orangefs_fault, .map_pages = filemap_map_pages, - .page_mkwrite = filemap_page_mkwrite, + .page_mkwrite = orangefs_page_mkwrite, }; /* diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 1c72aa38317d..add9c569a7dc 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -15,9 +15,11 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -static int orangefs_writepage(struct page *page, struct writeback_control *wbc) +static int orangefs_writepage_locked(struct page *page, + struct writeback_control *wbc) { struct inode *inode = page->mapping->host; + struct orangefs_write_range *wr = NULL; struct iov_iter iter; struct bio_vec bv; size_t len, wlen; @@ -26,34 +28,52 @@ static int orangefs_writepage(struct page *page, struct writeback_control *wbc) set_page_writeback(page); - off = page_offset(page); len = i_size_read(inode); - if (off > len) { - /* The file was truncated; there is nothing to write. */ - unlock_page(page); - end_page_writeback(page); - return 0; + if (PagePrivate(page)) { + wr = (struct orangefs_write_range *)page_private(page); + off = wr->pos; + if (off + wr->len > len) + wlen = len - off; + else + wlen = wr->len; + } else { + WARN_ON(1); + off = page_offset(page); + if (off + PAGE_SIZE > len) + wlen = len - off; + else + wlen = PAGE_SIZE; } - if (off + PAGE_SIZE > len) - wlen = len - off; - else - wlen = PAGE_SIZE; + /* Should've been handled in orangefs_invalidatepage. */ + WARN_ON(off == len || off + wlen > len); bv.bv_page = page; bv.bv_len = wlen; bv.bv_offset = off % PAGE_SIZE; - if (wlen == 0) - dump_stack(); + WARN_ON(wlen == 0); iov_iter_bvec(&iter, WRITE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, - len); + len, wr); if (ret < 0) { SetPageError(page); mapping_set_error(page->mapping, ret); } else { ret = 0; } + if (wr) { + kfree(wr); + set_page_private(page, 0); + ClearPagePrivate(page); + put_page(page); + } + return ret; +} + +static int orangefs_writepage(struct page *page, struct writeback_control *wbc) +{ + int ret; + ret = orangefs_writepage_locked(page, wbc); unlock_page(page); end_page_writeback(page); return ret; @@ -74,7 +94,7 @@ static int orangefs_readpage(struct file *file, struct page *page) iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, - PAGE_SIZE, inode->i_size); + PAGE_SIZE, inode->i_size, NULL); /* this will only zero remaining unread portions of the page data */ iov_iter_zero(~0U, &iter); /* takes care of potential aliasing */ @@ -92,6 +112,73 @@ static int orangefs_readpage(struct file *file, struct page *page) return ret; } +static int orangefs_launder_page(struct page *); + +static int orangefs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, struct page **pagep, + void **fsdata) +{ + struct orangefs_write_range *wr; + struct page *page; + pgoff_t index; + int ret; + + index = pos >> PAGE_SHIFT; + + page = grab_cache_page_write_begin(mapping, index, flags); + if (!page) + return -ENOMEM; + + *pagep = page; + + if (PageDirty(page) && !PagePrivate(page)) { + /* + * Should be impossible. If it happens, launder the page + * since we don't know what's dirty. This will WARN in + * orangefs_writepage_locked. + */ + ret = orangefs_launder_page(page); + if (ret) + return ret; + } + if (PagePrivate(page)) { + struct orangefs_write_range *wr; + wr = (struct orangefs_write_range *)page_private(page); + if (wr->pos + wr->len == pos && + uid_eq(wr->uid, current_fsuid()) && + gid_eq(wr->gid, current_fsgid())) { + wr->len += len; + goto okay; + } else { + ret = orangefs_launder_page(page); + if (ret) + return ret; + } + + } + + wr = kmalloc(sizeof *wr, GFP_KERNEL); + if (!wr) + return -ENOMEM; + + wr->pos = pos; + wr->len = len; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + SetPagePrivate(page); + set_page_private(page, (unsigned long)wr); + get_page(page); +okay: + + if (!PageUptodate(page) && (len != PAGE_SIZE)) { + unsigned from = pos & (PAGE_SIZE - 1); + + zero_user_segments(page, 0, from, from + len, PAGE_SIZE); + } + return 0; +} + static int orangefs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { @@ -105,24 +192,96 @@ static void orangefs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_invalidatepage called on page %p " - "(offset is %u)\n", - page, - offset); - - ClearPageUptodate(page); - ClearPageMappedToDisk(page); - return; + struct orangefs_write_range *wr; + wr = (struct orangefs_write_range *)page_private(page); + if (offset == 0 && length == PAGE_SIZE) { + kfree((struct orangefs_write_range *)page_private(page)); + set_page_private(page, 0); + ClearPagePrivate(page); + put_page(page); + /* write range entirely within invalidate range (or equal) */ + } else if (page_offset(page) + offset <= wr->pos && + wr->pos + wr->len <= page_offset(page) + offset + length) { + kfree((struct orangefs_write_range *)page_private(page)); + set_page_private(page, 0); + ClearPagePrivate(page); + put_page(page); + /* XXX is this right? only caller in fs */ + cancel_dirty_page(page); + /* invalidate range chops off end of write range */ + } else if (wr->pos < page_offset(page) + offset && + wr->pos + wr->len <= page_offset(page) + offset + length && + page_offset(page) + offset < wr->pos + wr->len) { + size_t x; + x = wr->pos + wr->len - (page_offset(page) + offset); + WARN_ON(x > wr->len); + wr->len -= x; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + /* invalidate range chops off beginning of write range */ + } else if (page_offset(page) + offset <= wr->pos && + page_offset(page) + offset + length < wr->pos + wr->len && + wr->pos < page_offset(page) + offset + length) { + size_t x; + x = page_offset(page) + offset + length - wr->pos; + WARN_ON(x > wr->len); + wr->pos += x; + wr->len -= x; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + /* invalidate range entirely within write range (punch hole) */ + } else if (wr->pos < page_offset(page) + offset && + page_offset(page) + offset + length < wr->pos + wr->len) { + /* XXX what do we do here... should not WARN_ON */ + WARN_ON(1); + /* punch hole */ + /* + * should we just ignore this and write it out anyway? + * it hardly makes sense + */ + /* non-overlapping ranges */ + } else { + /* WARN if they do overlap */ + if (!((page_offset(page) + offset + length <= wr->pos) ^ + (wr->pos + wr->len <= page_offset(page) + offset))) { + WARN_ON(1); + printk("invalidate range offset %llu length %u\n", + page_offset(page) + offset, length); + printk("write range offset %llu length %zu\n", + wr->pos, wr->len); + } + } } static int orangefs_releasepage(struct page *page, gfp_t foo) { - gossip_debug(GOSSIP_INODE_DEBUG, - "orangefs_releasepage called on page %p\n", - page); - return 0; + return !PagePrivate(page); +} + +static void orangefs_freepage(struct page *page) +{ + if (PagePrivate(page)) { + kfree((struct orangefs_write_range *)page_private(page)); + set_page_private(page, 0); + ClearPagePrivate(page); + put_page(page); + } +} + +static int orangefs_launder_page(struct page *page) +{ + int r = 0; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 0, + }; + wait_on_page_writeback(page); + if (clear_page_dirty_for_io(page)) { + r = orangefs_writepage_locked(page, &wbc); + end_page_writeback(page); + } + return r; } static ssize_t orangefs_direct_IO(struct kiocb *iocb, @@ -145,7 +304,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; size_t count = iov_iter_count(iter); - size_t ORIGINALcount = iov_iter_count(iter); ssize_t total_count = 0; ssize_t ret = -EINVAL; int i = 0; @@ -192,7 +350,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, (int)*offset); ret = wait_for_direct_io(type, inode, offset, iter, - each_count, 0); + each_count, 0, NULL); gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): return from wait_for_io:%d\n", __func__, @@ -247,13 +405,82 @@ static const struct address_space_operations orangefs_address_operations = { .writepage = orangefs_writepage, .readpage = orangefs_readpage, .set_page_dirty = __set_page_dirty_nobuffers, - .write_begin = simple_write_begin, + .write_begin = orangefs_write_begin, .write_end = orangefs_write_end, .invalidatepage = orangefs_invalidatepage, .releasepage = orangefs_releasepage, + .freepage = orangefs_freepage, + .launder_page = orangefs_launder_page, .direct_IO = orangefs_direct_IO, }; +vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) +{ + struct page *page = vmf->page; + struct inode *inode = file_inode(vmf->vma->vm_file); + vm_fault_t ret = VM_FAULT_LOCKED; + struct orangefs_write_range *wr; + + lock_page(page); + if (PageDirty(page) && !PagePrivate(page)) { + /* + * Should be impossible. If it happens, launder the page + * since we don't know what's dirty. This will WARN in + * orangefs_writepage_locked. + */ + if (orangefs_launder_page(page)) { + ret = VM_FAULT_RETRY; + goto out; + } + } + if (PagePrivate(page)) { + wr = (struct orangefs_write_range *)page_private(page); + if (uid_eq(wr->uid, current_fsuid()) && + gid_eq(wr->gid, current_fsgid())) { + wr->pos = page_offset(page); + wr->len = PAGE_SIZE; + goto okay; + } else { + if (orangefs_launder_page(page)) { + ret = VM_FAULT_RETRY; + goto out; + } + } + } + wr = kmalloc(sizeof *wr, GFP_KERNEL); + if (!wr) { + ret = VM_FAULT_RETRY; + goto out; + } + wr->pos = page_offset(page); + wr->len = PAGE_SIZE; + wr->uid = current_fsuid(); + wr->gid = current_fsgid(); + SetPagePrivate(page); + set_page_private(page, (unsigned long)wr); + get_page(page); +okay: + + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + if (page->mapping != inode->i_mapping) { + unlock_page(page); + ret = VM_FAULT_NOPAGE; + goto out; + } + + /* + * We mark the page dirty already here so that when freeze is in + * progress, we are guaranteed that writeback during freezing will + * see the dirty page and writeprotect it again. + */ + set_page_dirty(page); + wait_for_stable_page(page); +out: + sb_end_pagefault(inode->i_sb); + return ret; +} + static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 307bbb61819a..336a3ec0b83e 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -230,6 +230,13 @@ struct orangefs_cached_xattr { unsigned long timeout; }; +struct orangefs_write_range { + loff_t pos; + size_t len; + kuid_t uid; + kgid_t gid; +}; + extern struct orangefs_stats orangefs_stats; /* @@ -342,6 +349,7 @@ void fsid_key_table_finalize(void); /* * defined in inode.c */ +vm_fault_t orangefs_page_mkwrite(struct vm_fault *); struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, int mode, @@ -383,7 +391,7 @@ bool __is_daemon_in_service(void); * defined in file.c */ ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *, - struct iov_iter *, size_t, loff_t); + struct iov_iter *, size_t, loff_t, struct orangefs_write_range *); ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *, struct iov_iter *); From c472ebc25555e634d89e1ed508d37c9102bff017 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Fri, 14 Dec 2018 17:04:21 -0500 Subject: [PATCH 18/23] orangefs: implement writepages Go through pages and look for a consecutive writable region. After finding a number of consecutive writable pages or when finding that the next page's dirty range is not contiguous and cannot be written as one request, send the write to the server. The number of pages is determined by the client-core's buffer size. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index f409ac5d3410..405449ce4b02 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -386,6 +386,7 @@ static int orangefs_file_release(struct inode *inode, struct file *file) gossip_debug(GOSSIP_INODE_DEBUG, "flush_racache finished\n"); } + } return 0; } From 8f04e1be784858ba0288c7c09b9de06627a800c9 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Tue, 12 Feb 2019 20:19:06 +0000 Subject: [PATCH 19/23] orangefs: add orangefs_revalidate_mapping This is modeled after NFS, except our method is different. We use a simple timer to determine whether to invalidate the page cache. This is bound to perform. This addes a sysfs parameter cache_timeout_msecs which controls the time between page cache invalidations. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/file.c | 70 +++++++++- fs/orangefs/inode.c | 250 +++++++++++++++++++++++++++++++--- fs/orangefs/orangefs-kernel.h | 4 + fs/orangefs/orangefs-mod.c | 1 + fs/orangefs/orangefs-sysfs.c | 22 +++ 5 files changed, 328 insertions(+), 19 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 405449ce4b02..faa5b61cdfd6 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -241,18 +241,78 @@ out: return ret; } +int orangefs_revalidate_mapping(struct inode *inode) +{ + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + struct address_space *mapping = inode->i_mapping; + unsigned long *bitlock = &orangefs_inode->bitlock; + int ret; + + while (1) { + ret = wait_on_bit(bitlock, 1, TASK_KILLABLE); + if (ret) + return ret; + spin_lock(&inode->i_lock); + if (test_bit(1, bitlock)) { + spin_unlock(&inode->i_lock); + continue; + } + if (!time_before(jiffies, orangefs_inode->mapping_time)) + break; + spin_unlock(&inode->i_lock); + return 0; + } + + set_bit(1, bitlock); + smp_wmb(); + spin_unlock(&inode->i_lock); + + unmap_mapping_range(mapping, 0, 0, 0); + ret = filemap_write_and_wait(mapping); + if (!ret) + ret = invalidate_inode_pages2(mapping); + + orangefs_inode->mapping_time = jiffies + + orangefs_cache_timeout_msecs*HZ/1000; + + clear_bit(1, bitlock); + smp_mb__after_atomic(); + wake_up_bit(bitlock, 1); + + return ret; +} + static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { + int ret; orangefs_stats.reads++; - return generic_file_read_iter(iocb, iter); + + down_read(&file_inode(iocb->ki_filp)->i_rwsem); + ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); + if (ret) + goto out; + + ret = generic_file_read_iter(iocb, iter); +out: + up_read(&file_inode(iocb->ki_filp)->i_rwsem); + return ret; } static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) { + int ret; orangefs_stats.writes++; - return generic_file_write_iter(iocb, iter); + + if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) { + ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); + if (ret) + return ret; + } + + ret = generic_file_write_iter(iocb, iter); + return ret; } /* @@ -341,6 +401,12 @@ static const struct vm_operations_struct orangefs_file_vm_ops = { */ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) { + int ret; + + ret = orangefs_revalidate_mapping(file_inode(file)); + if (ret) + return ret; + gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_mmap: called on %s\n", (file ? diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index add9c569a7dc..7ed2ea093c4e 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -31,6 +31,7 @@ static int orangefs_writepage_locked(struct page *page, len = i_size_read(inode); if (PagePrivate(page)) { wr = (struct orangefs_write_range *)page_private(page); + WARN_ON(wr->pos >= len); off = wr->pos; if (off + wr->len > len) wlen = len - off; @@ -79,6 +80,173 @@ static int orangefs_writepage(struct page *page, struct writeback_control *wbc) return ret; } +struct orangefs_writepages { + loff_t off; + size_t len; + kuid_t uid; + kgid_t gid; + int maxpages; + int npages; + struct page **pages; + struct bio_vec *bv; +}; + +static int orangefs_writepages_work(struct orangefs_writepages *ow, + struct writeback_control *wbc) +{ + struct inode *inode = ow->pages[0]->mapping->host; + struct orangefs_write_range *wrp, wr; + struct iov_iter iter; + ssize_t ret; + size_t len; + loff_t off; + int i; + + len = i_size_read(inode); + + for (i = 0; i < ow->npages; i++) { + set_page_writeback(ow->pages[i]); + ow->bv[i].bv_page = ow->pages[i]; + ow->bv[i].bv_len = min(page_offset(ow->pages[i]) + PAGE_SIZE, + ow->off + ow->len) - + max(ow->off, page_offset(ow->pages[i])); + if (i == 0) + ow->bv[i].bv_offset = ow->off - + page_offset(ow->pages[i]); + else + ow->bv[i].bv_offset = 0; + } + iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len); + + WARN_ON(ow->off >= len); + if (ow->off + ow->len > len) + ow->len = len - ow->off; + + off = ow->off; + wr.uid = ow->uid; + wr.gid = ow->gid; + ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len, + 0, &wr); + if (ret < 0) { + for (i = 0; i < ow->npages; i++) { + SetPageError(ow->pages[i]); + mapping_set_error(ow->pages[i]->mapping, ret); + if (PagePrivate(ow->pages[i])) { + wrp = (struct orangefs_write_range *) + page_private(ow->pages[i]); + ClearPagePrivate(ow->pages[i]); + put_page(ow->pages[i]); + kfree(wrp); + } + end_page_writeback(ow->pages[i]); + unlock_page(ow->pages[i]); + } + } else { + ret = 0; + for (i = 0; i < ow->npages; i++) { + if (PagePrivate(ow->pages[i])) { + wrp = (struct orangefs_write_range *) + page_private(ow->pages[i]); + ClearPagePrivate(ow->pages[i]); + put_page(ow->pages[i]); + kfree(wrp); + } + end_page_writeback(ow->pages[i]); + unlock_page(ow->pages[i]); + } + } + return ret; +} + +static int orangefs_writepages_callback(struct page *page, + struct writeback_control *wbc, void *data) +{ + struct orangefs_writepages *ow = data; + struct orangefs_write_range *wr; + int ret; + + if (!PagePrivate(page)) { + unlock_page(page); + /* It's not private so there's nothing to write, right? */ + printk("writepages_callback not private!\n"); + BUG(); + return 0; + } + wr = (struct orangefs_write_range *)page_private(page); + + ret = -1; + if (ow->npages == 0) { + ow->off = wr->pos; + ow->len = wr->len; + ow->uid = wr->uid; + ow->gid = wr->gid; + ow->pages[ow->npages++] = page; + ret = 0; + goto done; + } + if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) { + orangefs_writepages_work(ow, wbc); + ow->npages = 0; + ret = -1; + goto done; + } + if (ow->off + ow->len == wr->pos) { + ow->len += wr->len; + ow->pages[ow->npages++] = page; + ret = 0; + goto done; + } +done: + if (ret == -1) { + if (ow->npages) { + orangefs_writepages_work(ow, wbc); + ow->npages = 0; + } + ret = orangefs_writepage_locked(page, wbc); + mapping_set_error(page->mapping, ret); + unlock_page(page); + end_page_writeback(page); + } else { + if (ow->npages == ow->maxpages) { + orangefs_writepages_work(ow, wbc); + ow->npages = 0; + } + } + return ret; +} + +static int orangefs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct orangefs_writepages *ow; + struct blk_plug plug; + int ret; + ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL); + if (!ow) + return -ENOMEM; + ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE; + ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL); + if (!ow->pages) { + kfree(ow); + return -ENOMEM; + } + ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL); + if (!ow->bv) { + kfree(ow->pages); + kfree(ow); + return -ENOMEM; + } + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow); + if (ow->npages) + ret = orangefs_writepages_work(ow, wbc); + blk_finish_plug(&plug); + kfree(ow->pages); + kfree(ow->bv); + kfree(ow); + return ret; +} + static int orangefs_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; @@ -93,6 +261,9 @@ static int orangefs_readpage(struct file *file, struct page *page) bv.bv_offset = 0; iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE); + if (PageDirty(page)) + orangefs_launder_page(page); + ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, PAGE_SIZE, inode->i_size, NULL); /* this will only zero remaining unread portions of the page data */ @@ -170,22 +341,42 @@ static int orangefs_write_begin(struct file *file, set_page_private(page, (unsigned long)wr); get_page(page); okay: - - if (!PageUptodate(page) && (len != PAGE_SIZE)) { - unsigned from = pos & (PAGE_SIZE - 1); - - zero_user_segments(page, 0, from, from + len, PAGE_SIZE); - } return 0; } static int orangefs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - int r; - r = simple_write_end(file, mapping, pos, len, copied, page, fsdata); + struct inode *inode = page->mapping->host; + loff_t last_pos = pos + copied; + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold the i_mutex. + */ + if (last_pos > inode->i_size) + i_size_write(inode, last_pos); + + /* zero the stale part of the page if we did a short copy */ + if (!PageUptodate(page)) { + unsigned from = pos & (PAGE_SIZE - 1); + if (copied < len) { + zero_user(page, from + copied, len - copied); + } + /* Set fully written pages uptodate. */ + if (pos == page_offset(page) && + (len == PAGE_SIZE || pos + len == inode->i_size)) { + zero_user_segment(page, from + copied, PAGE_SIZE); + SetPageUptodate(page); + } + } + + set_page_dirty(page); + unlock_page(page); + put_page(page); + mark_inode_dirty_sync(file_inode(file)); - return r; + return copied; } static void orangefs_invalidatepage(struct page *page, @@ -200,6 +391,7 @@ static void orangefs_invalidatepage(struct page *page, set_page_private(page, 0); ClearPagePrivate(page); put_page(page); + return; /* write range entirely within invalidate range (or equal) */ } else if (page_offset(page) + offset <= wr->pos && wr->pos + wr->len <= page_offset(page) + offset + length) { @@ -209,6 +401,7 @@ static void orangefs_invalidatepage(struct page *page, put_page(page); /* XXX is this right? only caller in fs */ cancel_dirty_page(page); + return; /* invalidate range chops off end of write range */ } else if (wr->pos < page_offset(page) + offset && wr->pos + wr->len <= page_offset(page) + offset + length && @@ -240,6 +433,7 @@ static void orangefs_invalidatepage(struct page *page, * should we just ignore this and write it out anyway? * it hardly makes sense */ + return; /* non-overlapping ranges */ } else { /* WARN if they do overlap */ @@ -251,7 +445,15 @@ static void orangefs_invalidatepage(struct page *page, printk("write range offset %llu length %zu\n", wr->pos, wr->len); } + return; } + + /* + * Above there are returns where wr is freed or where we WARN. + * Thus the following runs if wr was modified above. + */ + + orangefs_launder_page(page); } static int orangefs_releasepage(struct page *page, gfp_t foo) @@ -404,6 +606,7 @@ out: static const struct address_space_operations orangefs_address_operations = { .writepage = orangefs_writepage, .readpage = orangefs_readpage, + .writepages = orangefs_writepages, .set_page_dirty = __set_page_dirty_nobuffers, .write_begin = orangefs_write_begin, .write_end = orangefs_write_end, @@ -418,9 +621,18 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); - vm_fault_t ret = VM_FAULT_LOCKED; + struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); + unsigned long *bitlock = &orangefs_inode->bitlock; + vm_fault_t ret; struct orangefs_write_range *wr; + sb_start_pagefault(inode->i_sb); + + if (wait_on_bit(bitlock, 1, TASK_KILLABLE)) { + ret = VM_FAULT_RETRY; + goto out; + } + lock_page(page); if (PageDirty(page) && !PagePrivate(page)) { /* @@ -429,7 +641,7 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) * orangefs_writepage_locked. */ if (orangefs_launder_page(page)) { - ret = VM_FAULT_RETRY; + ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; goto out; } } @@ -442,14 +654,14 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) goto okay; } else { if (orangefs_launder_page(page)) { - ret = VM_FAULT_RETRY; + ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; goto out; } } } wr = kmalloc(sizeof *wr, GFP_KERNEL); if (!wr) { - ret = VM_FAULT_RETRY; + ret = VM_FAULT_LOCKED|VM_FAULT_RETRY; goto out; } wr->pos = page_offset(page); @@ -461,11 +673,10 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf) get_page(page); okay: - sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); if (page->mapping != inode->i_mapping) { unlock_page(page); - ret = VM_FAULT_NOPAGE; + ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE; goto out; } @@ -476,6 +687,7 @@ okay: */ set_page_dirty(page); wait_for_stable_page(page); + ret = VM_FAULT_LOCKED; out: sb_end_pagefault(inode->i_sb); return ret; @@ -553,13 +765,15 @@ int __orangefs_setattr(struct inode *inode, struct iattr *iattr) } else { gossip_debug(GOSSIP_UTILS_DEBUG, "User attempted to set sticky bit on non-root directory; returning EINVAL.\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } } if (iattr->ia_mode & (S_ISUID)) { gossip_debug(GOSSIP_UTILS_DEBUG, "Attempting to set setuid bit (not supported); returning EINVAL.\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } } @@ -741,6 +955,8 @@ static int orangefs_set_inode(struct inode *inode, void *data) ORANGEFS_I(inode)->refn.khandle = ref->khandle; ORANGEFS_I(inode)->attr_valid = 0; hash_init(ORANGEFS_I(inode)->xattr_cache); + ORANGEFS_I(inode)->mapping_time = jiffies - 1; + ORANGEFS_I(inode)->bitlock = 0; return 0; } diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 336a3ec0b83e..87beab10326a 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -193,9 +193,11 @@ struct orangefs_inode_s { sector_t last_failed_block_index_read; unsigned long getattr_time; + unsigned long mapping_time; int attr_valid; kuid_t attr_uid; kgid_t attr_gid; + unsigned long bitlock; DECLARE_HASHTABLE(xattr_cache, 4); }; @@ -390,6 +392,7 @@ bool __is_daemon_in_service(void); /* * defined in file.c */ +int orangefs_revalidate_mapping(struct inode *); ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *, struct iov_iter *, size_t, loff_t, struct orangefs_write_range *); ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *, @@ -427,6 +430,7 @@ int orangefs_normalize_to_errno(__s32 error_code); extern struct mutex orangefs_request_mutex; extern int op_timeout_secs; extern int slot_timeout_secs; +extern int orangefs_cache_timeout_msecs; extern int orangefs_dcache_timeout_msecs; extern int orangefs_getattr_timeout_msecs; extern struct list_head orangefs_superblocks; diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index 85ef87245a87..82cf8b3e568b 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -30,6 +30,7 @@ static ulong module_parm_debug_mask; __u64 orangefs_gossip_debug_mask; int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS; int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS; +int orangefs_cache_timeout_msecs = 50; int orangefs_dcache_timeout_msecs = 50; int orangefs_getattr_timeout_msecs = 50; diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index 19739aaee675..3627ea946402 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -62,6 +62,14 @@ * Slots are requested and waited for, * the wait times out after slot_timeout_secs. * + * What: /sys/fs/orangefs/cache_timeout_msecs + * Date: Mar 2018 + * Contact: Martin Brandenburg + * Description: + * Time in milliseconds between which + * orangefs_revalidate_mapping will invalidate the page + * cache. + * * What: /sys/fs/orangefs/dcache_timeout_msecs * Date: Jul 2016 * Contact: Martin Brandenburg @@ -221,6 +229,13 @@ static ssize_t sysfs_int_show(struct kobject *kobj, "%d\n", slot_timeout_secs); goto out; + } else if (!strcmp(attr->attr.name, + "cache_timeout_msecs")) { + rc = scnprintf(buf, + PAGE_SIZE, + "%d\n", + orangefs_cache_timeout_msecs); + goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { rc = scnprintf(buf, @@ -277,6 +292,9 @@ static ssize_t sysfs_int_store(struct kobject *kobj, } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) { rc = kstrtoint(buf, 0, &slot_timeout_secs); goto out; + } else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) { + rc = kstrtoint(buf, 0, &orangefs_cache_timeout_msecs); + goto out; } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) { rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs); goto out; @@ -818,6 +836,9 @@ static struct orangefs_attribute op_timeout_secs_attribute = static struct orangefs_attribute slot_timeout_secs_attribute = __ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store); +static struct orangefs_attribute cache_timeout_msecs_attribute = + __ATTR(cache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); + static struct orangefs_attribute dcache_timeout_msecs_attribute = __ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store); @@ -861,6 +882,7 @@ static struct orangefs_attribute perf_time_interval_secs_attribute = static struct attribute *orangefs_default_attrs[] = { &op_timeout_secs_attribute.attr, &slot_timeout_secs_attribute.attr, + &cache_timeout_msecs_attribute.attr, &dcache_timeout_msecs_attribute.attr, &getattr_timeout_msecs_attribute.attr, &readahead_count_attribute.attr, From c2549f8c7a28c00facaf911f700c4811cfd6f52b Mon Sep 17 00:00:00 2001 From: Mike Marshall Date: Mon, 25 Mar 2019 15:52:29 -0400 Subject: [PATCH 20/23] orangefs: remember count when reading. Orangefs wins when it can do IO on large (up to four meg) blocks at a time, and looses when it has to do tiny "small io" reads and writes. Accessing Orangefs through the pagecache with the kernel module helps with small io, both reading and writing, a great deal. Readpage generally tries to fetch a page (four k) at a time. We'll let users use "count" (as in read(2) or pread(2) for example) as a knob to control how much data they get from Orangefs at a time and we'll try to use the data to fill extra pagecache pages when we get to ->readpage, hopefully resulting in fewer calls to readpage and Orangefs userspace. We need a way to remember how they set count so that we can still have it available when we get to ->readpage. - We'll use file->private_data to keep track of "count". We'll wrap generic_file_open with orangefs_file_open and initialize private_data to NULL there. - In ->read_iter we have access to both "count" and file, so we'll kmalloc some space onto file->private_data and store "count" there. - We'll kfree file->private_data each time we visit ->flush and reinitialize it to NULL. Signed-off-by: Mike Marshall Signed-off-by: Martin Brandenburg --- fs/orangefs/file.c | 26 +++++++++++++++++++++++++- fs/orangefs/orangefs-kernel.h | 4 ++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index faa5b61cdfd6..74292d31d113 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -286,8 +286,23 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { int ret; + struct orangefs_read_options *ro; + orangefs_stats.reads++; + /* + * Remember how they set "count" in read(2) or pread(2) or whatever - + * users can use count as a knob to control orangefs io size and later + * we can try to help them fill as many pages as possible in readpage. + */ + if (!iocb->ki_filp->private_data) { + iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL); + if (!iocb->ki_filp->private_data) + return(ENOMEM); + ro = iocb->ki_filp->private_data; + ro->blksiz = iter->count; + } + down_read(&file_inode(iocb->ki_filp)->i_rwsem); ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); if (ret) @@ -556,6 +571,12 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) return rc; } +static int orangefs_file_open(struct inode * inode, struct file *file) +{ + file->private_data = NULL; + return generic_file_open(inode, file); +} + static int orangefs_flush(struct file *file, fl_owner_t id) { /* @@ -569,6 +590,9 @@ static int orangefs_flush(struct file *file, fl_owner_t id) struct inode *inode = file->f_mapping->host; int r; + kfree(file->private_data); + file->private_data = NULL; + if (inode->i_state & I_DIRTY_TIME) { spin_lock(&inode->i_lock); inode->i_state &= ~I_DIRTY_TIME; @@ -591,7 +615,7 @@ const struct file_operations orangefs_file_operations = { .lock = orangefs_lock, .unlocked_ioctl = orangefs_ioctl, .mmap = orangefs_file_mmap, - .open = generic_file_open, + .open = orangefs_file_open, .flush = orangefs_flush, .release = orangefs_file_release, .fsync = orangefs_fsync, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 87beab10326a..3ae2f129b9c7 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -239,6 +239,10 @@ struct orangefs_write_range { kgid_t gid; }; +struct orangefs_read_options { + ssize_t blksiz; +}; + extern struct orangefs_stats orangefs_stats; /* From 4077a0f25b001926f86d35f6236351583bada9a4 Mon Sep 17 00:00:00 2001 From: Mike Marshall Date: Mon, 25 Mar 2019 18:17:10 -0400 Subject: [PATCH 21/23] orangefs: pass slot index back to readpage. When userspace deposits more than a page of data into the shared buffer, we'll need to know which slot it is in when we get back to readpage so that we can try to use the extra data to fill some extra pages. Signed-off-by: Mike Marshall Signed-off-by: Martin Brandenburg --- fs/orangefs/file.c | 2 +- fs/orangefs/inode.c | 8 ++++---- fs/orangefs/orangefs-kernel.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 74292d31d113..68ba5ae7ef5d 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -47,7 +47,7 @@ static int flush_racache(struct inode *inode) */ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, loff_t *offset, struct iov_iter *iter, size_t total_size, - loff_t readahead_size, struct orangefs_write_range *wr) + loff_t readahead_size, struct orangefs_write_range *wr, int *index_return) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 7ed2ea093c4e..cded74edb47c 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -55,7 +55,7 @@ static int orangefs_writepage_locked(struct page *page, iov_iter_bvec(&iter, WRITE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, - len, wr); + len, wr, NULL); if (ret < 0) { SetPageError(page); mapping_set_error(page->mapping, ret); @@ -126,7 +126,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, wr.uid = ow->uid; wr.gid = ow->gid; ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len, - 0, &wr); + 0, &wr, NULL); if (ret < 0) { for (i = 0; i < ow->npages; i++) { SetPageError(ow->pages[i]); @@ -265,7 +265,7 @@ static int orangefs_readpage(struct file *file, struct page *page) orangefs_launder_page(page); ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, - PAGE_SIZE, inode->i_size, NULL); + PAGE_SIZE, inode->i_size, NULL, NULL); /* this will only zero remaining unread portions of the page data */ iov_iter_zero(~0U, &iter); /* takes care of potential aliasing */ @@ -552,7 +552,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb, (int)*offset); ret = wait_for_direct_io(type, inode, offset, iter, - each_count, 0, NULL); + each_count, 0, NULL, NULL); gossip_debug(GOSSIP_FILE_DEBUG, "%s(%pU): return from wait_for_io:%d\n", __func__, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 3ae2f129b9c7..572dd29fbd54 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -398,7 +398,7 @@ bool __is_daemon_in_service(void); */ int orangefs_revalidate_mapping(struct inode *); ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *, - struct iov_iter *, size_t, loff_t, struct orangefs_write_range *); + struct iov_iter *, size_t, loff_t, struct orangefs_write_range *, int *); ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *, struct iov_iter *); From dd59a6475c4cf69afac2ade01ab732b7825a2a45 Mon Sep 17 00:00:00 2001 From: Mike Marshall Date: Mon, 25 Mar 2019 18:59:29 -0400 Subject: [PATCH 22/23] orangefs: copy Orangefs-sized blocks into the pagecache if possible. ->readpage looks in file->private_data to try and find out how the userspace program set "count" in read(2) or with "dd bs=" or whatever. ->readpage uses "count" and inode->i_size to calculate how much data Orangefs should deposit in the Orangefs shared buffer, and remembers which slot the data is in. After copying data from the Orangefs shared buffer slot into "the page", readpage tries to increment through the pagecache index and fill as many pages as it can from the extra data in the shared buffer. Hopefully these extra pages will soon be needed by the vfs, and they'll be in the pagecache already. Signed-off-by: Mike Marshall Signed-off-by: Martin Brandenburg --- fs/orangefs/file.c | 37 +++++++++-- fs/orangefs/inode.c | 115 ++++++++++++++++++++++++++++++--- fs/orangefs/orangefs-bufmap.c | 13 ++++ fs/orangefs/orangefs-bufmap.h | 2 + fs/orangefs/orangefs-debugfs.c | 4 +- 5 files changed, 156 insertions(+), 15 deletions(-) diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 68ba5ae7ef5d..a35c17017210 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -54,6 +54,7 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, struct orangefs_kernel_op_s *new_op = NULL; int buffer_index = -1; ssize_t ret; + size_t copy_amount; new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); if (!new_op) @@ -212,8 +213,25 @@ populate_shared_memory: * can futher be kernel-space or user-space addresses. * or it can pointers to struct page's */ + + /* + * When reading, readahead_size will only be zero when + * we're doing O_DIRECT, otherwise we got here from + * orangefs_readpage. + * + * If we got here from orangefs_readpage we want to + * copy either a page or the whole file into the io + * vector, whichever is smaller. + */ + if (readahead_size) + copy_amount = + min(new_op->downcall.resp.io.amt_complete, + (__s64)PAGE_SIZE); + else + copy_amount = new_op->downcall.resp.io.amt_complete; + ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, - new_op->downcall.resp.io.amt_complete); + copy_amount); if (ret < 0) { gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", __func__, (long)ret); @@ -231,10 +249,19 @@ populate_shared_memory: out: if (buffer_index >= 0) { - orangefs_bufmap_put(buffer_index); - gossip_debug(GOSSIP_FILE_DEBUG, - "%s(%pU): PUT buffer_index %d\n", - __func__, handle, buffer_index); + if ((readahead_size) && (type == ORANGEFS_IO_READ)) { + /* readpage */ + *index_return = buffer_index; + gossip_debug(GOSSIP_FILE_DEBUG, + "%s: hold on to buffer_index :%d:\n", + __func__, buffer_index); + } else { + /* O_DIRECT */ + orangefs_bufmap_put(buffer_index); + gossip_debug(GOSSIP_FILE_DEBUG, + "%s(%pU): PUT buffer_index %d\n", + __func__, handle, buffer_index); + } buffer_index = -1; } op_release(new_op); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index cded74edb47c..3fb671dab81d 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -247,31 +247,80 @@ static int orangefs_writepages(struct address_space *mapping, return ret; } +static int orangefs_launder_page(struct page *); + static int orangefs_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct iov_iter iter; struct bio_vec bv; ssize_t ret; - loff_t off; + loff_t off; /* offset into this page */ + pgoff_t index; /* which page */ + struct page *next_page; + char *kaddr; + struct orangefs_read_options *ro = file->private_data; + loff_t read_size; + loff_t roundedup; + int buffer_index = -1; /* orangefs shared memory slot */ + int slot_index; /* index into slot */ + int remaining; + + /* + * If they set some miniscule size for "count" in read(2) + * (for example) then let's try to read a page, or the whole file + * if it is smaller than a page. Once "count" goes over a page + * then lets round up to the highest page size multiple that is + * less than or equal to "count" and do that much orangefs IO and + * try to fill as many pages as we can from it. + * + * "count" should be represented in ro->blksiz. + * + * inode->i_size = file size. + */ + if (ro) { + if (ro->blksiz < PAGE_SIZE) { + if (inode->i_size < PAGE_SIZE) + read_size = inode->i_size; + else + read_size = PAGE_SIZE; + } else { + roundedup = ((PAGE_SIZE - 1) & ro->blksiz) ? + ((ro->blksiz + PAGE_SIZE) & ~(PAGE_SIZE -1)) : + ro->blksiz; + if (roundedup > inode->i_size) + read_size = inode->i_size; + else + read_size = roundedup; + + } + } else { + read_size = PAGE_SIZE; + } + if (!read_size) + read_size = PAGE_SIZE; + + if (PageDirty(page)) + orangefs_launder_page(page); off = page_offset(page); + index = off >> PAGE_SHIFT; bv.bv_page = page; bv.bv_len = PAGE_SIZE; bv.bv_offset = 0; iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE); - if (PageDirty(page)) - orangefs_launder_page(page); - ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter, - PAGE_SIZE, inode->i_size, NULL, NULL); + read_size, inode->i_size, NULL, &buffer_index); + remaining = ret; /* this will only zero remaining unread portions of the page data */ iov_iter_zero(~0U, &iter); /* takes care of potential aliasing */ flush_dcache_page(page); if (ret < 0) { SetPageError(page); + unlock_page(page); + goto out; } else { SetPageUptodate(page); if (PageError(page)) @@ -280,11 +329,62 @@ static int orangefs_readpage(struct file *file, struct page *page) } /* unlock the page after the ->readpage() routine completes */ unlock_page(page); + + if (remaining > PAGE_SIZE) { + slot_index = 0; + while ((remaining - PAGE_SIZE) >= PAGE_SIZE) { + remaining -= PAGE_SIZE; + /* + * It is an optimization to try and fill more than one + * page... by now we've already gotten the single + * page we were after, if stuff doesn't seem to + * be going our way at this point just return + * and hope for the best. + * + * If we look for pages and they're already there is + * one reason to give up, and if they're not there + * and we can't create them is another reason. + */ + + index++; + slot_index++; + next_page = find_get_page(inode->i_mapping, index); + if (next_page) { + gossip_debug(GOSSIP_FILE_DEBUG, + "%s: found next page, quitting\n", + __func__); + put_page(next_page); + goto out; + } + next_page = find_or_create_page(inode->i_mapping, + index, + GFP_KERNEL); + /* + * I've never hit this, leave it as a printk for + * now so it will be obvious. + */ + if (!next_page) { + printk("%s: can't create next page, quitting\n", + __func__); + goto out; + } + kaddr = kmap_atomic(next_page); + orangefs_bufmap_page_fill(kaddr, + buffer_index, + slot_index); + kunmap_atomic(kaddr); + SetPageUptodate(next_page); + unlock_page(next_page); + put_page(next_page); + } + } + +out: + if (buffer_index != -1) + orangefs_bufmap_put(buffer_index); return ret; } -static int orangefs_launder_page(struct page *); - static int orangefs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, @@ -326,7 +426,6 @@ static int orangefs_write_begin(struct file *file, if (ret) return ret; } - } wr = kmalloc(sizeof *wr, GFP_KERNEL); diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 443bcd8c3c19..d4811f981608 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -538,3 +538,16 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter, } return 0; } + +void orangefs_bufmap_page_fill(void *page_to, + int buffer_index, + int slot_index) +{ + struct orangefs_bufmap_desc *from; + void *page_from; + + from = &__orangefs_bufmap->desc_array[buffer_index]; + page_from = kmap_atomic(from->page_array[slot_index]); + memcpy(page_to, page_from, PAGE_SIZE); + kunmap_atomic(page_from); +} diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h index c2c3c5a0eeab..75b2d2833af1 100644 --- a/fs/orangefs/orangefs-bufmap.h +++ b/fs/orangefs/orangefs-bufmap.h @@ -34,4 +34,6 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter, int buffer_index, size_t size); +void orangefs_bufmap_page_fill(void *kaddr, int buffer_index, int slot_index); + #endif /* __ORANGEFS_BUFMAP_H */ diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 0732cb08173e..87b1a6fce628 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -963,7 +963,7 @@ int orangefs_debugfs_new_client_mask(void __user *arg) return ret; } -int orangefs_debugfs_new_client_string(void __user *arg) +int orangefs_debugfs_new_client_string(void __user *arg) { int ret; @@ -1016,7 +1016,7 @@ int orangefs_debugfs_new_client_string(void __user *arg) return 0; } -int orangefs_debugfs_new_debug(void __user *arg) +int orangefs_debugfs_new_debug(void __user *arg) { struct dev_mask_info_s mask_info = {0}; int ret; From 33713cd09ccdc1e01b10d0782ae60200d4989553 Mon Sep 17 00:00:00 2001 From: Martin Brandenburg Date: Mon, 29 Apr 2019 17:09:48 +0000 Subject: [PATCH 23/23] orangefs: truncate before updating size Otherwise we race with orangefs_writepage/orangefs_writepages which and does not expect i_size < page_offset. Fixes xfstests generic/129. Signed-off-by: Martin Brandenburg Signed-off-by: Mike Marshall --- fs/orangefs/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 3fb671dab81d..0c337d8bdaab 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -818,7 +818,11 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) } orig_size = i_size_read(inode); - truncate_setsize(inode, iattr->ia_size); + /* This is truncate_setsize in a different order. */ + truncate_pagecache(inode, iattr->ia_size); + i_size_write(inode, iattr->ia_size); + if (iattr->ia_size > orig_size) + pagecache_isize_extended(inode, orig_size, iattr->ia_size); new_op = op_alloc(ORANGEFS_VFS_OP_TRUNCATE); if (!new_op)