From f2ebb3a921c1ca1e2ddd9242e95a1989a50c4c68 Mon Sep 17 00:00:00 2001
From: Al Viro
Date: Thu, 27 Feb 2014 09:35:45 -0500
Subject: [PATCH 01/63] smarter propagate_mnt()

The current mainline propagates copies to *all* nodes, then tears down
the copies we made for nodes that do not contain counterparts of the
desired mountpoint.  That sets the right propagation graph for the
copies (at teardown time we move the slaves of the removed node to a
surviving peer or directly to the master), but we end up paying a
fairly steep price in useless allocations.  It is fairly easy to create
a situation where N calls of mount(2) create exactly N bindings, with
O(N^2) vfsmounts allocated and freed in the process.

Fortunately, it is possible to avoid those allocations/freeings.  The
trick is to create the copies in the right order and find which one
would have eventually become a master under the current algorithm.  It
turns out to be possible in O(nodes getting propagation) time and with
no extra allocations at all.

One part is making sure that the eventual master is created before its
slaves, so we need to walk the propagation tree in a different order:
by peer groups, iterating through the peers before dealing with the
next group.

The other part is finding the (earlier) copy that will be the master of
the one we are about to create; to do that we temporarily mark the
masters of the mountpoints we are attaching the copies to.  Either we
are in a peer of the last mountpoint we dealt with, or we have the
following situation: we are attaching to mountpoint M, the last copy
S_0 had been attached to M_0, and there are sequences S_0...S_n,
M_0...M_n such that S_{i+1} is a master of S_{i}, S_{i} is mounted on
M_{i}, and we need to create a slave of the first S_{k} such that M is
getting propagation from M_{k}.  It means that the master of M_{k} will
be among the sequence of masters of M.  On the other hand, the nearest
marked node in that sequence will be either the master of M_{k} or the
master of M_{k-1} (the latter in the case where M_{k-1} is a slave of
something M gets propagation from, but in the wrong peer group).

So we go through the sequence of masters of M until we find a marked
one (P).  Let N be the one before it.  Then we go through the sequence
of masters of S_0 until we find one (say, S) mounted on a node D that
has P as its master, and check whether D is a peer of N.  If it is, S
will be the master of the new copy; if not, the master of S will be.

That's it for the hard part; the rest is fairly simple.  The iterator
is next_group(); handling of one prospective mountpoint is done in
propagate_one().

It seems to survive all tests and gives noticeably better performance
than the current mainline for setups that make serious use of shared
subtrees.
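To make the setup concrete, here is a minimal userspace sketch of a
propagation fan-out of the kind propagate_mnt() has to handle.  It is
an illustration only, not taken from this series; the paths, the
receiver count and the use of tmpfs are arbitrary choices (run as
root):

/*
 * Illustration only: build a propagation fan-out so that a single
 * mount(2) on the source is propagated to every receiver and the
 * kernel has to create one vfsmount copy per receiver.  Paths, the
 * receiver count and the use of tmpfs are arbitrary; run as root.
 */
#include <sys/mount.h>
#include <sys/stat.h>
#include <stdio.h>

int main(void)
{
	char path[64];
	int i, n = 64;

	/* propagation source: a shared tmpfs on /mnt/src */
	mkdir("/mnt/src", 0755);
	if (mount("none", "/mnt/src", "tmpfs", 0, NULL) ||
	    mount(NULL, "/mnt/src", NULL, MS_SHARED, NULL)) {
		perror("source");
		return 1;
	}
	/* mountpoint that exists inside the source (and therefore in
	 * every receiver, since they all bind the same tree) */
	mkdir("/mnt/src/sub", 0755);

	/* n receivers: each is a bind of the shared mount, turned into
	 * a slave of the source's peer group */
	for (i = 0; i < n; i++) {
		snprintf(path, sizeof(path), "/mnt/recv%d", i);
		mkdir(path, 0755);
		if (mount("/mnt/src", path, NULL, MS_BIND, NULL) ||
		    mount(NULL, path, NULL, MS_SLAVE, NULL)) {
			perror("receiver");
			return 1;
		}
	}

	/*
	 * This one call propagates to all n receivers; propagate_mnt()
	 * must create a copy for each of them and pick the right master
	 * for every copy - the work the patch below reorganizes.
	 */
	if (mount("none", "/mnt/src/sub", "tmpfs", 0, NULL)) {
		perror("propagated mount");
		return 1;
	}
	return 0;
}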
Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/namespace.c | 11 ++- fs/pnode.c | 198 +++++++++++++++++++++++++----------------- fs/pnode.h | 3 + include/linux/mount.h | 3 + 4 files changed, 133 insertions(+), 82 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 2ffc5a2905d4..65233a5f390a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -885,7 +885,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, goto out_free; } - mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~MNT_WRITE_HOLD; + mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); /* Don't allow unprivileged users to change mount flags */ if ((flag & CL_UNPRIVILEGED) && (mnt->mnt.mnt_flags & MNT_READONLY)) mnt->mnt.mnt_flags |= MNT_LOCK_READONLY; @@ -1661,9 +1661,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); + lock_mount_hash(); if (err) goto out_cleanup_ids; - lock_mount_hash(); for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } else { @@ -1690,6 +1690,11 @@ static int attach_recursive_mnt(struct mount *source_mnt, return 0; out_cleanup_ids: + while (!hlist_empty(&tree_list)) { + child = hlist_entry(tree_list.first, struct mount, mnt_hash); + umount_tree(child, 0); + } + unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: return err; @@ -2044,7 +2049,7 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) struct mount *parent; int err; - mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | MNT_DOOMED | MNT_SYNC_UMOUNT); + mnt_flags &= ~MNT_INTERNAL_FLAGS; mp = lock_mount(path); if (IS_ERR(mp)) diff --git a/fs/pnode.c b/fs/pnode.c index 88396df725b4..302bf22c4a30 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -164,46 +164,94 @@ static struct mount *propagation_next(struct mount *m, } } -/* - * return the source mount to be used for cloning - * - * @dest the current destination mount - * @last_dest the last seen destination mount - * @last_src the last seen source mount - * @type return CL_SLAVE if the new mount has to be - * cloned as a slave. - */ -static struct mount *get_source(struct mount *dest, - struct mount *last_dest, - struct mount *last_src, - int *type) +static struct mount *next_group(struct mount *m, struct mount *origin) { - struct mount *p_last_src = NULL; - struct mount *p_last_dest = NULL; - - while (last_dest != dest->mnt_master) { - p_last_dest = last_dest; - p_last_src = last_src; - last_dest = last_dest->mnt_master; - last_src = last_src->mnt_master; - } - - if (p_last_dest) { - do { - p_last_dest = next_peer(p_last_dest); - } while (IS_MNT_NEW(p_last_dest)); - /* is that a peer of the earlier? 
*/ - if (dest == p_last_dest) { - *type = CL_MAKE_SHARED; - return p_last_src; + while (1) { + while (1) { + struct mount *next; + if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) + return first_slave(m); + next = next_peer(m); + if (m->mnt_group_id == origin->mnt_group_id) { + if (next == origin) + return NULL; + } else if (m->mnt_slave.next != &next->mnt_slave) + break; + m = next; } + /* m is the last peer */ + while (1) { + struct mount *master = m->mnt_master; + if (m->mnt_slave.next != &master->mnt_slave_list) + return next_slave(m); + m = next_peer(master); + if (master->mnt_group_id == origin->mnt_group_id) + break; + if (master->mnt_slave.next == &m->mnt_slave) + break; + m = master; + } + if (m == origin) + return NULL; } - /* slave of the earlier, then */ - *type = CL_SLAVE; - /* beginning of peer group among the slaves? */ - if (IS_MNT_SHARED(dest)) - *type |= CL_MAKE_SHARED; - return last_src; +} + +/* all accesses are serialized by namespace_sem */ +static struct user_namespace *user_ns; +static struct mount *last_dest, *last_source, *dest_master; +static struct mountpoint *mp; +static struct hlist_head *list; + +static int propagate_one(struct mount *m) +{ + struct mount *child; + int type; + /* skip ones added by this propagate_mnt() */ + if (IS_MNT_NEW(m)) + return 0; + /* skip if mountpoint isn't covered by it */ + if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) + return 0; + if (m->mnt_group_id == last_dest->mnt_group_id) { + type = CL_MAKE_SHARED; + } else { + struct mount *n, *p; + for (n = m; ; n = p) { + p = n->mnt_master; + if (p == dest_master || IS_MNT_MARKED(p)) { + while (last_dest->mnt_master != p) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + if (n->mnt_group_id != last_dest->mnt_group_id) { + last_source = last_source->mnt_master; + last_dest = last_source->mnt_parent; + } + break; + } + } + type = CL_SLAVE; + /* beginning of peer group among the slaves? */ + if (IS_MNT_SHARED(m)) + type |= CL_MAKE_SHARED; + } + + /* Notice when we are propagating across user namespaces */ + if (m->mnt_ns->user_ns != user_ns) + type |= CL_UNPRIVILEGED; + child = copy_tree(last_source, last_source->mnt.mnt_root, type); + if (IS_ERR(child)) + return PTR_ERR(child); + mnt_set_mountpoint(m, mp, child); + last_dest = m; + last_source = child; + if (m->mnt_master != dest_master) { + read_seqlock_excl(&mount_lock); + SET_MNT_MARK(m->mnt_master); + read_sequnlock_excl(&mount_lock); + } + hlist_add_head(&child->mnt_hash, list); + return 0; } /* @@ -222,56 +270,48 @@ static struct mount *get_source(struct mount *dest, int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct hlist_head *tree_list) { - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; - struct mount *m, *child; + struct mount *m, *n; int ret = 0; - struct mount *prev_dest_mnt = dest_mnt; - struct mount *prev_src_mnt = source_mnt; - HLIST_HEAD(tmp_list); - for (m = propagation_next(dest_mnt, dest_mnt); m; - m = propagation_next(m, dest_mnt)) { - int type; - struct mount *source; + /* + * we don't want to bother passing tons of arguments to + * propagate_one(); everything is serialized by namespace_sem, + * so globals will do just fine. 
+ */ + user_ns = current->nsproxy->mnt_ns->user_ns; + last_dest = dest_mnt; + last_source = source_mnt; + mp = dest_mp; + list = tree_list; + dest_master = dest_mnt->mnt_master; - if (IS_MNT_NEW(m)) - continue; - - source = get_source(m, prev_dest_mnt, prev_src_mnt, &type); - - /* Notice when we are propagating across user namespaces */ - if (m->mnt_ns->user_ns != user_ns) - type |= CL_UNPRIVILEGED; - - child = copy_tree(source, source->mnt.mnt_root, type); - if (IS_ERR(child)) { - ret = PTR_ERR(child); - tmp_list = *tree_list; - tmp_list.first->pprev = &tmp_list.first; - INIT_HLIST_HEAD(tree_list); + /* all peers of dest_mnt, except dest_mnt itself */ + for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { + ret = propagate_one(n); + if (ret) goto out; - } + } - if (is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) { - mnt_set_mountpoint(m, dest_mp, child); - hlist_add_head(&child->mnt_hash, tree_list); - } else { - /* - * This can happen if the parent mount was bind mounted - * on some subdirectory of a shared/slave mount. - */ - hlist_add_head(&child->mnt_hash, &tmp_list); - } - prev_dest_mnt = m; - prev_src_mnt = child; + /* all slave groups */ + for (m = next_group(dest_mnt, dest_mnt); m; + m = next_group(m, dest_mnt)) { + /* everything in that slave group */ + n = m; + do { + ret = propagate_one(n); + if (ret) + goto out; + n = next_peer(n); + } while (n != m); } out: - lock_mount_hash(); - while (!hlist_empty(&tmp_list)) { - child = hlist_entry(tmp_list.first, struct mount, mnt_hash); - umount_tree(child, 0); + read_seqlock_excl(&mount_lock); + hlist_for_each_entry(n, tree_list, mnt_hash) { + m = n->mnt_parent; + if (m->mnt_master != dest_mnt->mnt_master) + CLEAR_MNT_MARK(m->mnt_master); } - unlock_mount_hash(); + read_sequnlock_excl(&mount_lock); return ret; } diff --git a/fs/pnode.h b/fs/pnode.h index fc28a27fa892..4a246358b031 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -16,6 +16,9 @@ #define IS_MNT_NEW(m) (!(m)->mnt_ns) #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) +#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) +#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) +#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 diff --git a/include/linux/mount.h b/include/linux/mount.h index 371d346fa270..839bac270904 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -44,6 +44,8 @@ struct mnt_namespace; #define MNT_SHARED_MASK (MNT_UNBINDABLE) #define MNT_PROPAGATION_MASK (MNT_SHARED | MNT_UNBINDABLE) +#define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) #define MNT_INTERNAL 0x4000 @@ -51,6 +53,7 @@ struct mnt_namespace; #define MNT_LOCKED 0x800000 #define MNT_DOOMED 0x1000000 #define MNT_SYNC_UMOUNT 0x2000000 +#define MNT_MARKED 0x4000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ From c7999c3627bc6d49aa6fb9451063938cfd2c2082 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 27 Feb 2014 14:40:10 -0500 Subject: [PATCH 02/63] reduce m_start() cost... 
Signed-off-by: Al Viro --- fs/mount.h | 5 ++++- fs/namespace.c | 21 ++++++++++++++++++--- fs/proc_namespace.c | 1 + 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/fs/mount.h b/fs/mount.h index b29e42f05f34..d55297f2fa05 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -10,7 +10,7 @@ struct mnt_namespace { struct user_namespace *user_ns; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; - int event; + u64 event; }; struct mnt_pcp { @@ -104,6 +104,9 @@ struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); + void *cached_mount; + u64 cached_event; + loff_t cached_index; }; #define proc_mounts(p) (container_of((p), struct proc_mounts, m)) diff --git a/fs/namespace.c b/fs/namespace.c index 65233a5f390a..a66aff5bd3fe 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -52,7 +52,7 @@ static int __init set_mphash_entries(char *str) } __setup("mphash_entries=", set_mphash_entries); -static int event; +static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); static DEFINE_SPINLOCK(mnt_id_lock); @@ -1100,14 +1100,29 @@ static void *m_start(struct seq_file *m, loff_t *pos) struct proc_mounts *p = proc_mounts(m); down_read(&namespace_sem); - return seq_list_start(&p->ns->list, *pos); + if (p->cached_event == p->ns->event) { + void *v = p->cached_mount; + if (*pos == p->cached_index) + return v; + if (*pos == p->cached_index + 1) { + v = seq_list_next(v, &p->ns->list, &p->cached_index); + return p->cached_mount = v; + } + } + + p->cached_event = p->ns->event; + p->cached_mount = seq_list_start(&p->ns->list, *pos); + p->cached_index = *pos; + return p->cached_mount; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = proc_mounts(m); - return seq_list_next(v, &p->ns->list, pos); + p->cached_mount = seq_list_next(v, &p->ns->list, pos); + p->cached_index = *pos; + return p->cached_mount; } static void m_stop(struct seq_file *m, void *v) diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 7be26f03a3f5..1a81373947f3 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -267,6 +267,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, p->root = root; p->m.poll_event = ns->event; p->show = show; + p->cached_event = ~0ULL; return 0; From 964ea96ebaa4014a3d4bc9d6950b460899e3814d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Mar 2014 20:33:08 -0500 Subject: [PATCH 03/63] usbip: don't open-code sockfd_lookup/sockfd_put Signed-off-by: Al Viro --- drivers/staging/usbip/stub_dev.c | 8 ++++---- drivers/staging/usbip/usbip_common.c | 25 ------------------------- drivers/staging/usbip/usbip_common.h | 1 - drivers/staging/usbip/vhci_hcd.c | 4 ++-- drivers/staging/usbip/vhci_sysfs.c | 6 +++--- 5 files changed, 9 insertions(+), 35 deletions(-) diff --git a/drivers/staging/usbip/stub_dev.c b/drivers/staging/usbip/stub_dev.c index 76a1ff0e6275..2e2ccefb9c2b 100644 --- a/drivers/staging/usbip/stub_dev.c +++ b/drivers/staging/usbip/stub_dev.c @@ -86,7 +86,6 @@ static ssize_t store_sockfd(struct device *dev, struct device_attribute *attr, struct stub_device *sdev = dev_get_drvdata(dev); int sockfd = 0; struct socket *socket; - ssize_t err = -EINVAL; if (!sdev) { dev_err(dev, "sdev is null\n"); @@ -96,6 +95,7 @@ static ssize_t store_sockfd(struct device *dev, struct device_attribute *attr, sscanf(buf, "%d", &sockfd); if (sockfd != -1) { + int err; dev_info(dev, "stub up\n"); spin_lock_irq(&sdev->ud.lock); @@ -105,7 +105,7 @@ 
static ssize_t store_sockfd(struct device *dev, struct device_attribute *attr, goto err; } - socket = sockfd_to_socket(sockfd); + socket = sockfd_lookup(sockfd, &err); if (!socket) goto err; @@ -138,7 +138,7 @@ static ssize_t store_sockfd(struct device *dev, struct device_attribute *attr, err: spin_unlock_irq(&sdev->ud.lock); - return err; + return -EINVAL; } static DEVICE_ATTR(usbip_sockfd, S_IWUSR, NULL, store_sockfd); @@ -208,7 +208,7 @@ static void stub_shutdown_connection(struct usbip_device *ud) * not touch NULL socket. */ if (ud->tcp_socket) { - fput(ud->tcp_socket->file); + sockfd_put(ud->tcp_socket); ud->tcp_socket = NULL; } diff --git a/drivers/staging/usbip/usbip_common.c b/drivers/staging/usbip/usbip_common.c index 96552e3a1bfb..e010939ebb12 100644 --- a/drivers/staging/usbip/usbip_common.c +++ b/drivers/staging/usbip/usbip_common.c @@ -400,31 +400,6 @@ err: } EXPORT_SYMBOL_GPL(usbip_recv); -struct socket *sockfd_to_socket(unsigned int sockfd) -{ - struct socket *socket; - struct file *file; - struct inode *inode; - - file = fget(sockfd); - if (!file) { - pr_err("invalid sockfd\n"); - return NULL; - } - - inode = file_inode(file); - - if (!inode || !S_ISSOCK(inode->i_mode)) { - fput(file); - return NULL; - } - - socket = SOCKET_I(inode); - - return socket; -} -EXPORT_SYMBOL_GPL(sockfd_to_socket); - /* there may be more cases to tweak the flags. */ static unsigned int tweak_transfer_flags(unsigned int flags) { diff --git a/drivers/staging/usbip/usbip_common.h b/drivers/staging/usbip/usbip_common.h index 7e6c5436d972..9f86588a4534 100644 --- a/drivers/staging/usbip/usbip_common.h +++ b/drivers/staging/usbip/usbip_common.h @@ -314,7 +314,6 @@ void usbip_dump_urb(struct urb *purb); void usbip_dump_header(struct usbip_header *pdu); int usbip_recv(struct socket *sock, void *buf, int size); -struct socket *sockfd_to_socket(unsigned int sockfd); void usbip_pack_pdu(struct usbip_header *pdu, struct urb *urb, int cmd, int pack); diff --git a/drivers/staging/usbip/vhci_hcd.c b/drivers/staging/usbip/vhci_hcd.c index 72391ef87646..99dd2b1656c9 100644 --- a/drivers/staging/usbip/vhci_hcd.c +++ b/drivers/staging/usbip/vhci_hcd.c @@ -789,7 +789,7 @@ static void vhci_shutdown_connection(struct usbip_device *ud) /* active connection is closed */ if (vdev->ud.tcp_socket) { - fput(vdev->ud.tcp_socket->file); + sockfd_put(vdev->ud.tcp_socket); vdev->ud.tcp_socket = NULL; } pr_info("release socket\n"); @@ -836,7 +836,7 @@ static void vhci_device_reset(struct usbip_device *ud) vdev->udev = NULL; if (ud->tcp_socket) { - fput(ud->tcp_socket->file); + sockfd_put(ud->tcp_socket); ud->tcp_socket = NULL; } ud->status = VDEV_ST_NULL; diff --git a/drivers/staging/usbip/vhci_sysfs.c b/drivers/staging/usbip/vhci_sysfs.c index 0141bc34d5cc..baba127081b3 100644 --- a/drivers/staging/usbip/vhci_sysfs.c +++ b/drivers/staging/usbip/vhci_sysfs.c @@ -175,6 +175,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, struct socket *socket; int sockfd = 0; __u32 rhport = 0, devid = 0, speed = 0; + int err; /* * @rhport: port number of vhci_hcd @@ -192,8 +193,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, return -EINVAL; /* Extract socket from fd. */ - /* The correct way to clean this up is to fput(socket->file). 
*/ - socket = sockfd_to_socket(sockfd); + socket = sockfd_lookup(sockfd, &err); if (!socket) return -EINVAL; @@ -209,7 +209,7 @@ static ssize_t store_attach(struct device *dev, struct device_attribute *attr, spin_unlock(&vdev->ud.lock); spin_unlock(&the_controller->lock); - fput(socket->file); + sockfd_put(socket); dev_err(dev, "port %d already used\n", rhport); return -EINVAL; From 09aaacf02a3e88870ed5cad038a5bc822c947904 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Mar 2014 20:39:00 -0500 Subject: [PATCH 04/63] vhost: don't open-code sockfd_put() Signed-off-by: Al Viro --- drivers/vhost/net.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index e1e22e0f01e8..be414d2b2b22 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -818,9 +818,9 @@ static int vhost_net_release(struct inode *inode, struct file *f) vhost_dev_cleanup(&n->dev, false); vhost_net_vq_reset(n); if (tx_sock) - fput(tx_sock->file); + sockfd_put(tx_sock); if (rx_sock) - fput(rx_sock->file); + sockfd_put(rx_sock); /* Make sure no callbacks are outstanding */ synchronize_rcu_bh(); /* We do an extra flush before freeing memory, @@ -860,7 +860,7 @@ static struct socket *get_raw_socket(int fd) } return sock; err: - fput(sock->file); + sockfd_put(sock); return ERR_PTR(r); } @@ -966,7 +966,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) if (oldsock) { vhost_net_flush_vq(n, index); - fput(oldsock->file); + sockfd_put(oldsock); } mutex_unlock(&n->dev.mutex); @@ -978,7 +978,7 @@ err_used: if (ubufs) vhost_net_ubuf_put_wait_and_free(ubufs); err_ubufs: - fput(sock->file); + sockfd_put(sock); err_vq: mutex_unlock(&vq->mutex); err: @@ -1009,9 +1009,9 @@ static long vhost_net_reset_owner(struct vhost_net *n) done: mutex_unlock(&n->dev.mutex); if (tx_sock) - fput(tx_sock->file); + sockfd_put(tx_sock); if (rx_sock) - fput(rx_sock->file); + sockfd_put(rx_sock); return err; } From e25115786ee540fc428a14872ebd4f56252aba32 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Mar 2014 20:41:36 -0500 Subject: [PATCH 05/63] switch nbd to sockfd_lookup/sockfd_put Signed-off-by: Al Viro --- drivers/block/nbd.c | 48 ++++++++++++++++++--------------------------- include/linux/nbd.h | 3 +-- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 55298db36b2d..3a70ea2f7cd6 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -630,37 +630,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, } case NBD_CLEAR_SOCK: { - struct file *file; - + struct socket *sock = nbd->sock; nbd->sock = NULL; - file = nbd->file; - nbd->file = NULL; nbd_clear_que(nbd); BUG_ON(!list_empty(&nbd->queue_head)); BUG_ON(!list_empty(&nbd->waiting_queue)); kill_bdev(bdev); - if (file) - fput(file); + if (sock) + sockfd_put(sock); return 0; } case NBD_SET_SOCK: { - struct file *file; - if (nbd->file) + struct socket *sock; + int err; + if (nbd->sock) return -EBUSY; - file = fget(arg); - if (file) { - struct inode *inode = file_inode(file); - if (S_ISSOCK(inode->i_mode)) { - nbd->file = file; - nbd->sock = SOCKET_I(inode); - if (max_part > 0) - bdev->bd_invalidated = 1; - nbd->disconnect = 0; /* we're connected now */ - return 0; - } else { - fput(file); - } + sock = sockfd_lookup(arg, &err); + if (sock) { + nbd->sock = sock; + if (max_part > 0) + bdev->bd_invalidated = 1; + nbd->disconnect = 0; /* we're connected now */ + return 0; } return -EINVAL; } @@ -697,12 
+689,12 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, case NBD_DO_IT: { struct task_struct *thread; - struct file *file; + struct socket *sock; int error; if (nbd->pid) return -EBUSY; - if (!nbd->file) + if (!nbd->sock) return -EINVAL; mutex_unlock(&nbd->tx_lock); @@ -731,15 +723,15 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, if (error) return error; sock_shutdown(nbd, 0); - file = nbd->file; - nbd->file = NULL; + sock = nbd->sock; + nbd->sock = NULL; nbd_clear_que(nbd); dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); kill_bdev(bdev); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); set_device_ro(bdev, false); - if (file) - fput(file); + if (sock) + sockfd_put(sock); nbd->flags = 0; nbd->bytesize = 0; bdev->bd_inode->i_size = 0; @@ -875,9 +867,7 @@ static int __init nbd_init(void) for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; - nbd_dev[i].file = NULL; nbd_dev[i].magic = NBD_MAGIC; - nbd_dev[i].flags = 0; INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); spin_lock_init(&nbd_dev[i].queue_lock); INIT_LIST_HEAD(&nbd_dev[i].queue_head); diff --git a/include/linux/nbd.h b/include/linux/nbd.h index ae4981ebd18e..f62f78aef4ac 100644 --- a/include/linux/nbd.h +++ b/include/linux/nbd.h @@ -24,8 +24,7 @@ struct request; struct nbd_device { int flags; int harderror; /* Code of hard error */ - struct socket * sock; - struct file * file; /* If == NULL, device is not ready, yet */ + struct socket * sock; /* If == NULL, device is not ready, yet */ int magic; spinlock_t queue_lock; From 44ba8406d0005400e56c6b6a279a669eb761d1b8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 6 Mar 2014 17:41:01 -0500 Subject: [PATCH 06/63] ncpfs: switch to sockfd_lookup()/sockfd_put() Signed-off-by: Al Viro --- fs/ncpfs/inode.c | 50 +++++++++++--------------------------------- fs/ncpfs/ncp_fs_sb.h | 2 -- 2 files changed, 12 insertions(+), 40 deletions(-) diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 2cf2ebecb55f..ceeca64f0599 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -468,9 +468,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) { struct ncp_mount_data_kernel data; struct ncp_server *server; - struct file *ncp_filp; struct inode *root_inode; - struct inode *sock_inode; struct socket *sock; int error; int default_bufsize; @@ -539,18 +537,10 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (!uid_valid(data.mounted_uid) || !uid_valid(data.uid) || !gid_valid(data.gid)) goto out; - error = -EBADF; - ncp_filp = fget(data.ncp_fd); - if (!ncp_filp) - goto out; - error = -ENOTSOCK; - sock_inode = file_inode(ncp_filp); - if (!S_ISSOCK(sock_inode->i_mode)) - goto out_fput; - sock = SOCKET_I(sock_inode); + sock = sockfd_lookup(data.ncp_fd, &error); if (!sock) - goto out_fput; - + goto out; + if (sock->type == SOCK_STREAM) default_bufsize = 0xF000; else @@ -572,27 +562,16 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (error) goto out_fput; - server->ncp_filp = ncp_filp; server->ncp_sock = sock; if (data.info_fd != -1) { - struct socket *info_sock; - - error = -EBADF; - server->info_filp = fget(data.info_fd); - if (!server->info_filp) - goto out_bdi; - error = -ENOTSOCK; - sock_inode = file_inode(server->info_filp); - if (!S_ISSOCK(sock_inode->i_mode)) - goto out_fput2; - info_sock = SOCKET_I(sock_inode); + struct socket *info_sock = sockfd_lookup(data.info_fd, &error); if (!info_sock) - goto out_fput2; + 
goto out_bdi; + server->info_sock = info_sock; error = -EBADFD; if (info_sock->type != SOCK_STREAM) goto out_fput2; - server->info_sock = info_sock; } /* server->lock = 0; */ @@ -764,17 +743,12 @@ out_nls: mutex_destroy(&server->root_setup_lock); mutex_destroy(&server->mutex); out_fput2: - if (server->info_filp) - fput(server->info_filp); + if (server->info_sock) + sockfd_put(server->info_sock); out_bdi: bdi_destroy(&server->bdi); out_fput: - /* 23/12/1998 Marcin Dalecki : - * - * The previously used put_filp(ncp_filp); was bogus, since - * it doesn't perform proper unlocking. - */ - fput(ncp_filp); + sockfd_put(sock); out: put_pid(data.wdog_pid); sb->s_fs_info = NULL; @@ -807,9 +781,9 @@ static void ncp_put_super(struct super_block *sb) mutex_destroy(&server->root_setup_lock); mutex_destroy(&server->mutex); - if (server->info_filp) - fput(server->info_filp); - fput(server->ncp_filp); + if (server->info_sock) + sockfd_put(server->info_sock); + sockfd_put(server->ncp_sock); kill_pid(server->m.wdog_pid, SIGTERM, 1); put_pid(server->m.wdog_pid); diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index b81e97adc5a9..7fa17e459366 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -45,9 +45,7 @@ struct ncp_server { __u8 name_space[NCP_NUMBER_OF_VOLUMES + 2]; - struct file *ncp_filp; /* File pointer to ncp socket */ struct socket *ncp_sock;/* ncp socket */ - struct file *info_filp; struct socket *info_sock; u8 sequence; From dd20908a8a06b22c171f6c3fcdbdbd65bed07505 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 10:56:20 -0400 Subject: [PATCH 07/63] don't bother with {get,put}_write_access() on non-regular files it's pointless and actually leads to wrong behaviour in at least one moderately convoluted case (pipe(), close one end, try to get to another via /proc/*/fd and run into ETXTBUSY). Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/file_table.c | 4 ++-- fs/open.c | 26 +++++++------------------- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 5b24008ea4f6..79ecae62209a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -209,10 +209,10 @@ static void drop_file_write_access(struct file *file) struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; - put_write_access(inode); - if (special_file(inode->i_mode)) return; + + put_write_access(inode); if (file_check_writeable(file) != 0) return; __mnt_drop_write(mnt); diff --git a/fs/open.c b/fs/open.c index b9ed8b25c108..2ed7325f713e 100644 --- a/fs/open.c +++ b/fs/open.c @@ -641,23 +641,12 @@ out: static inline int __get_file_write_access(struct inode *inode, struct vfsmount *mnt) { - int error; - error = get_write_access(inode); + int error = get_write_access(inode); if (error) return error; - /* - * Do not take mount writer counts on - * special files since no writes to - * the mount itself will occur. 
- */ - if (!special_file(inode->i_mode)) { - /* - * Balanced in __fput() - */ - error = __mnt_want_write(mnt); - if (error) - put_write_access(inode); - } + error = __mnt_want_write(mnt); + if (error) + put_write_access(inode); return error; } @@ -690,12 +679,11 @@ static int do_dentry_open(struct file *f, path_get(&f->f_path); inode = f->f_inode = f->f_path.dentry->d_inode; - if (f->f_mode & FMODE_WRITE) { + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = __get_file_write_access(inode, f->f_path.mnt); if (error) goto cleanup_file; - if (!special_file(inode->i_mode)) - file_take_write(f); + file_take_write(f); } f->f_mapping = inode->i_mapping; @@ -742,7 +730,6 @@ static int do_dentry_open(struct file *f, cleanup_all: fops_put(f->f_op); if (f->f_mode & FMODE_WRITE) { - put_write_access(inode); if (!special_file(inode->i_mode)) { /* * We don't consider this a real @@ -750,6 +737,7 @@ cleanup_all: * because it all happenend right * here, so just reset the state. */ + put_write_access(inode); file_reset_write(f); __mnt_drop_write(f->f_path.mnt); } From 4597e695b8baa3e2620da89c7593be70cf20566b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 10:06:32 -0400 Subject: [PATCH 08/63] get rid of DEBUG_WRITECOUNT it only makes control flow in __fput() and friends more convoluted. Signed-off-by: Al Viro --- arch/powerpc/configs/ppc6xx_defconfig | 1 - arch/powerpc/configs/ps3_defconfig | 1 - arch/s390/configs/default_defconfig | 1 - arch/sh/configs/rsk7203_defconfig | 1 - arch/xtensa/configs/iss_defconfig | 1 - arch/xtensa/configs/s6105_defconfig | 1 - fs/file_table.c | 5 --- fs/open.c | 8 ----- include/linux/fs.h | 49 --------------------------- lib/Kconfig.debug | 10 ------ 10 files changed, 78 deletions(-) diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index c2353bf059fd..175a8b99c196 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -1244,7 +1244,6 @@ CONFIG_DEBUG_SPINLOCK_SLEEP=y CONFIG_DEBUG_HIGHMEM=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_VM=y -CONFIG_DEBUG_WRITECOUNT=y CONFIG_DEBUG_LIST=y CONFIG_DEBUG_SG=y # CONFIG_RCU_CPU_STALL_DETECTOR is not set diff --git a/arch/powerpc/configs/ps3_defconfig b/arch/powerpc/configs/ps3_defconfig index 139a8308070c..fdee37fab81c 100644 --- a/arch/powerpc/configs/ps3_defconfig +++ b/arch/powerpc/configs/ps3_defconfig @@ -174,7 +174,6 @@ CONFIG_DETECT_HUNG_TASK=y CONFIG_PROVE_LOCKING=y CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_INFO=y -CONFIG_DEBUG_WRITECOUNT=y CONFIG_DEBUG_MEMORY_INIT=y CONFIG_DEBUG_LIST=y CONFIG_RCU_CPU_STALL_TIMEOUT=60 diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index e0af2ee58751..3f538468a86e 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -550,7 +550,6 @@ CONFIG_LOCK_STAT=y CONFIG_DEBUG_LOCKDEP=y CONFIG_DEBUG_ATOMIC_SLEEP=y CONFIG_DEBUG_LOCKING_API_SELFTESTS=y -CONFIG_DEBUG_WRITECOUNT=y CONFIG_DEBUG_LIST=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig index 4e5229b0c5bb..47236573db83 100644 --- a/arch/sh/configs/rsk7203_defconfig +++ b/arch/sh/configs/rsk7203_defconfig @@ -128,7 +128,6 @@ CONFIG_DEBUG_MUTEXES=y CONFIG_DEBUG_SPINLOCK_SLEEP=y CONFIG_DEBUG_INFO=y CONFIG_DEBUG_VM=y -CONFIG_DEBUG_WRITECOUNT=y CONFIG_DEBUG_LIST=y CONFIG_DEBUG_SG=y CONFIG_FRAME_POINTER=y diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig index 
4f233204faf9..711f8aa14743 100644 --- a/arch/xtensa/configs/iss_defconfig +++ b/arch/xtensa/configs/iss_defconfig @@ -627,7 +627,6 @@ CONFIG_SCHED_DEBUG=y # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_VM is not set -# CONFIG_DEBUG_WRITECOUNT is not set # CONFIG_DEBUG_MEMORY_INIT is not set # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set diff --git a/arch/xtensa/configs/s6105_defconfig b/arch/xtensa/configs/s6105_defconfig index d929f77a0360..78318a76fa16 100644 --- a/arch/xtensa/configs/s6105_defconfig +++ b/arch/xtensa/configs/s6105_defconfig @@ -569,7 +569,6 @@ CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_VM is not set CONFIG_DEBUG_NOMMU_REGIONS=y -# CONFIG_DEBUG_WRITECOUNT is not set # CONFIG_DEBUG_MEMORY_INIT is not set # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set diff --git a/fs/file_table.c b/fs/file_table.c index 79ecae62209a..ee20658a0647 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -52,7 +52,6 @@ static void file_free_rcu(struct rcu_head *head) static inline void file_free(struct file *f) { percpu_counter_dec(&nr_files); - file_check_state(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -186,7 +185,6 @@ struct file *alloc_file(struct path *path, fmode_t mode, * that we can do debugging checks at __fput() */ if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { - file_take_write(file); WARN_ON(mnt_clone_write(path->mnt)); } if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) @@ -213,10 +211,7 @@ static void drop_file_write_access(struct file *file) return; put_write_access(inode); - if (file_check_writeable(file) != 0) - return; __mnt_drop_write(mnt); - file_release_write(file); } /* the real guts of fput() - releasing the last reference to file diff --git a/fs/open.c b/fs/open.c index 2ed7325f713e..8d0b6adfe7b8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -683,7 +683,6 @@ static int do_dentry_open(struct file *f, error = __get_file_write_access(inode, f->f_path.mnt); if (error) goto cleanup_file; - file_take_write(f); } f->f_mapping = inode->i_mapping; @@ -731,14 +730,7 @@ cleanup_all: fops_put(f->f_op); if (f->f_mode & FMODE_WRITE) { if (!special_file(inode->i_mode)) { - /* - * We don't consider this a real - * mnt_want/drop_write() pair - * because it all happenend right - * here, so just reset the state. 
- */ put_write_access(inode); - file_reset_write(f); __mnt_drop_write(f->f_path.mnt); } } diff --git a/include/linux/fs.h b/include/linux/fs.h index 23b2a35d712e..e80659ed78fc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -769,9 +769,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) index < ra->start + ra->size); } -#define FILE_MNT_WRITE_TAKEN 1 -#define FILE_MNT_WRITE_RELEASED 2 - struct file { union { struct llist_node fu_llist; @@ -809,9 +806,6 @@ struct file { struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; -#ifdef CONFIG_DEBUG_WRITECOUNT - unsigned long f_mnt_write_state; -#endif } __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { @@ -829,49 +823,6 @@ static inline struct file *get_file(struct file *f) #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1) #define file_count(x) atomic_long_read(&(x)->f_count) -#ifdef CONFIG_DEBUG_WRITECOUNT -static inline void file_take_write(struct file *f) -{ - WARN_ON(f->f_mnt_write_state != 0); - f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN; -} -static inline void file_release_write(struct file *f) -{ - f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED; -} -static inline void file_reset_write(struct file *f) -{ - f->f_mnt_write_state = 0; -} -static inline void file_check_state(struct file *f) -{ - /* - * At this point, either both or neither of these bits - * should be set. - */ - WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN); - WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED); -} -static inline int file_check_writeable(struct file *f) -{ - if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) - return 0; - printk(KERN_WARNING "writeable file with no " - "mnt_want_write()\n"); - WARN_ON(1); - return -EINVAL; -} -#else /* !CONFIG_DEBUG_WRITECOUNT */ -static inline void file_take_write(struct file *filp) {} -static inline void file_release_write(struct file *filp) {} -static inline void file_reset_write(struct file *filp) {} -static inline void file_check_state(struct file *filp) {} -static inline int file_check_writeable(struct file *filp) -{ - return 0; -} -#endif /* CONFIG_DEBUG_WRITECOUNT */ - #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a48abeac753f..7a0859314bdf 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1030,16 +1030,6 @@ config DEBUG_BUGVERBOSE of the BUG call as well as the EIP and oops trace. This aids debugging but costs about 70-100K of memory. -config DEBUG_WRITECOUNT - bool "Debug filesystem writers count" - depends on DEBUG_KERNEL - help - Enable this to catch wrong use of the writers count in struct - vfsmount. This will increase the size of each file struct by - 32 bits. - - If unsure, say N. 
- config DEBUG_LIST bool "Debug linked list manipulation" depends on DEBUG_KERNEL From 0ccb286346c4c0644be17f04a9eb23ad99262882 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 10:40:46 -0400 Subject: [PATCH 09/63] fold __get_file_write_access() into its only caller Signed-off-by: Al Viro --- fs/open.c | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/fs/open.c b/fs/open.c index 8d0b6adfe7b8..ebef0c5fa10c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -632,24 +632,6 @@ out: return error; } -/* - * You have to be very careful that these write - * counts get cleaned up in error cases and - * upon __fput(). This should probably never - * be called outside of __dentry_open(). - */ -static inline int __get_file_write_access(struct inode *inode, - struct vfsmount *mnt) -{ - int error = get_write_access(inode); - if (error) - return error; - error = __mnt_want_write(mnt); - if (error) - put_write_access(inode); - return error; -} - int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ @@ -680,9 +662,14 @@ static int do_dentry_open(struct file *f, path_get(&f->f_path); inode = f->f_inode = f->f_path.dentry->d_inode; if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { - error = __get_file_write_access(inode, f->f_path.mnt); + error = get_write_access(inode); if (error) goto cleanup_file; + error = __mnt_want_write(f->f_path.mnt); + if (error) { + put_write_access(inode); + goto cleanup_file; + } } f->f_mapping = inode->i_mapping; From 83f936c75e3689a63253d89c47a4d239c56d7410 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 12:02:47 -0400 Subject: [PATCH 10/63] mark struct file that had write access grabbed by open() new flag in ->f_mode - FMODE_WRITER. Set by do_dentry_open() in case when it has grabbed write access, checked by __fput() to decide whether it wants to drop the sucker. Allows to stop bothering with mnt_clone_write() in alloc_file(), along with fewer special_file() checks. Signed-off-by: Al Viro --- fs/file_table.c | 37 ++++--------------------------------- fs/namespace.c | 4 +--- fs/open.c | 9 ++++----- include/linux/fs.h | 2 ++ 4 files changed, 11 insertions(+), 41 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index ee20658a0647..ce1504fec5a1 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -177,43 +177,12 @@ struct file *alloc_file(struct path *path, fmode_t mode, file->f_mapping = path->dentry->d_inode->i_mapping; file->f_mode = mode; file->f_op = fop; - - /* - * These mounts don't really matter in practice - * for r/o bind mounts. They aren't userspace- - * visible. We do this for consistency, and so - * that we can do debugging checks at __fput() - */ - if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { - WARN_ON(mnt_clone_write(path->mnt)); - } if ((mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(path->dentry->d_inode); return file; } EXPORT_SYMBOL(alloc_file); -/** - * drop_file_write_access - give up ability to write to a file - * @file: the file to which we will stop writing - * - * This is a central place which will give up the ability - * to write to @file, along with access to write through - * its vfsmount. 
- */ -static void drop_file_write_access(struct file *file) -{ - struct vfsmount *mnt = file->f_path.mnt; - struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - - if (special_file(inode->i_mode)) - return; - - put_write_access(inode); - __mnt_drop_write(mnt); -} - /* the real guts of fput() - releasing the last reference to file */ static void __fput(struct file *file) @@ -248,8 +217,10 @@ static void __fput(struct file *file) put_pid(file->f_owner.pid); if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_dec(inode); - if (file->f_mode & FMODE_WRITE) - drop_file_write_access(file); + if (file->f_mode & FMODE_WRITER) { + put_write_access(inode); + __mnt_drop_write(mnt); + } file->f_path.dentry = NULL; file->f_path.mnt = NULL; file->f_inode = NULL; diff --git a/fs/namespace.c b/fs/namespace.c index a66aff5bd3fe..20e8696c31a7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -414,9 +414,7 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); */ int __mnt_want_write_file(struct file *file) { - struct inode *inode = file_inode(file); - - if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) + if (!(file->f_mode & FMODE_WRITER)) return __mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); diff --git a/fs/open.c b/fs/open.c index ebef0c5fa10c..dcefb2f02d10 100644 --- a/fs/open.c +++ b/fs/open.c @@ -670,6 +670,7 @@ static int do_dentry_open(struct file *f, put_write_access(inode); goto cleanup_file; } + f->f_mode |= FMODE_WRITER; } f->f_mapping = inode->i_mapping; @@ -715,11 +716,9 @@ static int do_dentry_open(struct file *f, cleanup_all: fops_put(f->f_op); - if (f->f_mode & FMODE_WRITE) { - if (!special_file(inode->i_mode)) { - put_write_access(inode); - __mnt_drop_write(f->f_path.mnt); - } + if (f->f_mode & FMODE_WRITER) { + put_write_access(inode); + __mnt_drop_write(f->f_path.mnt); } cleanup_file: path_put(&f->f_path); diff --git a/include/linux/fs.h b/include/linux/fs.h index e80659ed78fc..d9d88a0b456e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -125,6 +125,8 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File needs atomic accesses to f_pos */ #define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) +/* Write access to underlying fs */ +#define FMODE_WRITER ((__force fmode_t)0x10000) /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) From 3f4d5a00076b7e340625a2014cb83e10bf0d6dd1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 09:43:29 -0400 Subject: [PATCH 11/63] tidy do_dentry_open() up a bit Signed-off-by: Al Viro --- fs/open.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/open.c b/fs/open.c index dcefb2f02d10..37f65fa44dbf 100644 --- a/fs/open.c +++ b/fs/open.c @@ -656,30 +656,28 @@ static int do_dentry_open(struct file *f, f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - if (unlikely(f->f_flags & O_PATH)) - f->f_mode = FMODE_PATH; - path_get(&f->f_path); inode = f->f_inode = f->f_path.dentry->d_inode; + f->f_mapping = inode->i_mapping; + + if (unlikely(f->f_flags & O_PATH)) { + f->f_mode = FMODE_PATH; + f->f_op = &empty_fops; + return 0; + } + if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { error = get_write_access(inode); - if (error) + if (unlikely(error)) goto cleanup_file; error = __mnt_want_write(f->f_path.mnt); - if (error) { + if (unlikely(error)) { put_write_access(inode); goto 
cleanup_file; } f->f_mode |= FMODE_WRITER; } - f->f_mapping = inode->i_mapping; - - if (unlikely(f->f_mode & FMODE_PATH)) { - f->f_op = &empty_fops; - return 0; - } - /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ if (S_ISREG(inode->i_mode)) f->f_mode |= FMODE_ATOMIC_POS; From 0018d8bfc4f41de27e45517380e7d90be3580b44 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 12:09:36 -0400 Subject: [PATCH 12/63] get_write_access() is inlined, exporting it is pointless Signed-off-by: Al Viro --- fs/namei.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/namei.c b/fs/namei.c index 4b491b431990..2fb4fe57f4b1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4413,7 +4413,6 @@ EXPORT_SYMBOL(user_path_at); EXPORT_SYMBOL(follow_down_one); EXPORT_SYMBOL(follow_down); EXPORT_SYMBOL(follow_up); -EXPORT_SYMBOL(get_write_access); /* nfsd */ EXPORT_SYMBOL(lock_rename); EXPORT_SYMBOL(lookup_one_len); EXPORT_SYMBOL(page_follow_link_light); From 4d359507346a156491300cc193252b525892ae91 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 12:20:17 -0400 Subject: [PATCH 13/63] namei.c: move EXPORT_SYMBOL to corresponding definitions Signed-off-by: Al Viro --- fs/namei.c | 55 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 2fb4fe57f4b1..617de9e9967b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -358,6 +358,7 @@ int generic_permission(struct inode *inode, int mask) return -EACCES; } +EXPORT_SYMBOL(generic_permission); /* * We _really_ want to just do "generic_permission()" without @@ -455,6 +456,7 @@ int inode_permission(struct inode *inode, int mask) return retval; return __inode_permission(inode, mask); } +EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path @@ -924,6 +926,7 @@ int follow_up(struct path *path) path->mnt = &parent->mnt; return 1; } +EXPORT_SYMBOL(follow_up); /* * Perform an automount @@ -1085,6 +1088,7 @@ int follow_down_one(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down_one); static inline bool managed_dentry_might_block(struct dentry *dentry) { @@ -1223,6 +1227,7 @@ int follow_down(struct path *path) } return 0; } +EXPORT_SYMBOL(follow_down); /* * Skip to top of mountpoint pile in refwalk mode for follow_dotdot() @@ -2025,6 +2030,7 @@ int kern_path(const char *name, unsigned int flags, struct path *path) *path = nd.path; return res; } +EXPORT_SYMBOL(kern_path); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair @@ -2049,6 +2055,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, *path = nd.path; return err; } +EXPORT_SYMBOL(vfs_path_lookup); /* * Restricted form of lookup. 
Doesn't follow links, single-component only, @@ -2111,6 +2118,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) return __lookup_hash(&this, base, 0); } +EXPORT_SYMBOL(lookup_one_len); int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) @@ -2135,6 +2143,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags, { return user_path_at_empty(dfd, name, flags, path, NULL); } +EXPORT_SYMBOL(user_path_at); /* * NB: most callers don't do anything directly with the reference to the @@ -2477,6 +2486,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); return NULL; } +EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { @@ -2486,6 +2496,7 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } +EXPORT_SYMBOL(unlock_rename); int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) @@ -2506,6 +2517,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_create); static int may_open(struct path *path, int acc_mode, int flag) { @@ -3376,6 +3388,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { @@ -3465,6 +3478,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fsnotify_mkdir(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_mkdir); SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { @@ -3519,6 +3533,7 @@ void dentry_unhash(struct dentry *dentry) __d_drop(dentry); spin_unlock(&dentry->d_lock); } +EXPORT_SYMBOL(dentry_unhash); int vfs_rmdir(struct inode *dir, struct dentry *dentry) { @@ -3556,6 +3571,7 @@ out: d_delete(dentry); return error; } +EXPORT_SYMBOL(vfs_rmdir); static long do_rmdir(int dfd, const char __user *pathname) { @@ -3673,6 +3689,7 @@ out: return error; } +EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its @@ -3786,6 +3803,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) fsnotify_create(dir, dentry); return error; } +EXPORT_SYMBOL(vfs_symlink); SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) @@ -3894,6 +3912,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de fsnotify_link(dir, inode, new_dentry); return error; } +EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid @@ -4156,6 +4175,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, return error; } +EXPORT_SYMBOL(vfs_rename); SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) @@ -4293,6 +4313,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c out: return len; } +EXPORT_SYMBOL(vfs_readlink); /* * A helper for ->readlink(). 
This should be used *ONLY* for symlinks that @@ -4315,6 +4336,7 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) dentry->d_inode->i_op->put_link(dentry, &nd, cookie); return res; } +EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ static char *page_getlink(struct dentry * dentry, struct page **ppage) @@ -4342,6 +4364,7 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) } return res; } +EXPORT_SYMBOL(page_readlink); void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) { @@ -4349,6 +4372,7 @@ void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) nd_set_link(nd, page_getlink(dentry, &page)); return page; } +EXPORT_SYMBOL(page_follow_link_light); void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) { @@ -4359,6 +4383,7 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) page_cache_release(page); } } +EXPORT_SYMBOL(page_put_link); /* * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS @@ -4396,44 +4421,18 @@ retry: fail: return err; } +EXPORT_SYMBOL(__page_symlink); int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); } +EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, }; - -EXPORT_SYMBOL(user_path_at); -EXPORT_SYMBOL(follow_down_one); -EXPORT_SYMBOL(follow_down); -EXPORT_SYMBOL(follow_up); -EXPORT_SYMBOL(lock_rename); -EXPORT_SYMBOL(lookup_one_len); -EXPORT_SYMBOL(page_follow_link_light); -EXPORT_SYMBOL(page_put_link); -EXPORT_SYMBOL(page_readlink); -EXPORT_SYMBOL(__page_symlink); -EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); -EXPORT_SYMBOL(kern_path); -EXPORT_SYMBOL(vfs_path_lookup); -EXPORT_SYMBOL(inode_permission); -EXPORT_SYMBOL(unlock_rename); -EXPORT_SYMBOL(vfs_create); -EXPORT_SYMBOL(vfs_link); -EXPORT_SYMBOL(vfs_mkdir); -EXPORT_SYMBOL(vfs_mknod); -EXPORT_SYMBOL(generic_permission); -EXPORT_SYMBOL(vfs_readlink); -EXPORT_SYMBOL(vfs_rename); -EXPORT_SYMBOL(vfs_rmdir); -EXPORT_SYMBOL(vfs_symlink); -EXPORT_SYMBOL(vfs_unlink); -EXPORT_SYMBOL(dentry_unhash); -EXPORT_SYMBOL(generic_readlink); From 7f4b36f9bb930b3b2105a9a2cb0121fa7028c432 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 12:45:29 -0400 Subject: [PATCH 14/63] get rid of files_defer_init() the only thing it's doing these days is calculation of upper limit for fs.nr_open sysctl and that can be done statically Signed-off-by: Al Viro --- fs/file.c | 11 ++++------- fs/file_table.c | 1 - include/linux/fdtable.h | 2 -- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/file.c b/fs/file.c index eb56a13dab3e..682103b95f8f 100644 --- a/fs/file.c +++ b/fs/file.c @@ -25,7 +25,10 @@ int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; -int sysctl_nr_open_max = 1024 * 1024; /* raised later */ +/* our max() is unusable in constant expressions ;-/ */ +#define __const_max(x, y) ((x) < (y) ? 
(x) : (y)) +int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) & + -BITS_PER_LONG; static void *alloc_fdmem(size_t size) { @@ -429,12 +432,6 @@ void exit_files(struct task_struct *tsk) } } -void __init files_defer_init(void) -{ - sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & - -BITS_PER_LONG; -} - struct files_struct init_files = { .count = ATOMIC_INIT(1), .fdt = &init_files.fdtab, diff --git a/fs/file_table.c b/fs/file_table.c index ce1504fec5a1..718e8e5224f8 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -325,6 +325,5 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - files_defer_init(); percpu_counter_init(&nr_files, 0); } diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 70e8e21c0a30..230f87bdf5ad 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -63,8 +63,6 @@ struct file_operations; struct vfsmount; struct dentry; -extern void __init files_defer_init(void); - #define rcu_dereference_check_fdtable(files, fdtfd) \ rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock)) From 4efcc9ffcd4fc53f1f7de539842cdffa1f8e5ecc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 12:54:25 -0400 Subject: [PATCH 15/63] lustre: generic_readlink() is just fine there, TYVM... Signed-off-by: Al Viro --- drivers/staging/lustre/lustre/llite/symlink.c | 23 +------------------ 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/drivers/staging/lustre/lustre/llite/symlink.c b/drivers/staging/lustre/lustre/llite/symlink.c index ab06891f7fc7..80d48b5ae247 100644 --- a/drivers/staging/lustre/lustre/llite/symlink.c +++ b/drivers/staging/lustre/lustre/llite/symlink.c @@ -115,27 +115,6 @@ failed: return rc; } -static int ll_readlink(struct dentry *dentry, char *buffer, int buflen) -{ - struct inode *inode = dentry->d_inode; - struct ptlrpc_request *request; - char *symname; - int rc; - - CDEBUG(D_VFSTRACE, "VFS Op\n"); - - ll_inode_size_lock(inode); - rc = ll_readlink_internal(inode, &request, &symname); - if (rc) - GOTO(out, rc); - - rc = vfs_readlink(dentry, buffer, buflen, symname); - out: - ptlrpc_req_finished(request); - ll_inode_size_unlock(inode); - return rc; -} - static void *ll_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -175,7 +154,7 @@ static void ll_put_link(struct dentry *dentry, struct nameidata *nd, void *cooki } struct inode_operations ll_fast_symlink_inode_operations = { - .readlink = ll_readlink, + .readlink = generic_readlink, .setattr = ll_setattr, .follow_link = ll_follow_link, .put_link = ll_put_link, From 5d826c847b34de6415b4f1becd88a57ff619af50 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 14 Mar 2014 13:42:45 -0400 Subject: [PATCH 16/63] new helper: readlink_copy() Signed-off-by: Al Viro --- fs/namei.c | 13 +++++-------- fs/proc/namespaces.c | 14 ++++---------- fs/proc/self.c | 2 +- fs/xfs/xfs_ioctl.c | 28 +--------------------------- include/linux/fs.h | 2 +- 5 files changed, 12 insertions(+), 47 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 617de9e9967b..4fb52f0ca5cb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4297,11 +4297,9 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return sys_renameat(AT_FDCWD, oldname, AT_FDCWD, newname); } -int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const char *link) +int readlink_copy(char __user *buffer, 
int buflen, const char *link) { - int len; - - len = PTR_ERR(link); + int len = PTR_ERR(link); if (IS_ERR(link)) goto out; @@ -4313,7 +4311,7 @@ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen, const c out: return len; } -EXPORT_SYMBOL(vfs_readlink); +EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that @@ -4331,7 +4329,7 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) if (IS_ERR(cookie)) return PTR_ERR(cookie); - res = vfs_readlink(dentry, buffer, buflen, nd_get_link(&nd)); + res = readlink_copy(buffer, buflen, nd_get_link(&nd)); if (dentry->d_inode->i_op->put_link) dentry->d_inode->i_op->put_link(dentry, &nd, cookie); return res; @@ -4356,8 +4354,7 @@ static char *page_getlink(struct dentry * dentry, struct page **ppage) int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct page *page = NULL; - char *s = page_getlink(dentry, &page); - int res = vfs_readlink(dentry,buffer,buflen,s); + int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); if (page) { kunmap(page); page_cache_release(page); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 9ae46b87470d..89026095f2b5 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -146,7 +146,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl struct task_struct *task; void *ns; char name[50]; - int len = -EACCES; + int res = -EACCES; task = get_proc_task(inode); if (!task) @@ -155,24 +155,18 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto out_put_task; - len = -ENOENT; + res = -ENOENT; ns = ns_ops->get(task); if (!ns) goto out_put_task; snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns)); - len = strlen(name); - - if (len > buflen) - len = buflen; - if (copy_to_user(buffer, name, len)) - len = -EFAULT; - + res = readlink_copy(buffer, buflen, name); ns_ops->put(ns); out_put_task: put_task_struct(task); out: - return len; + return res; } static const struct inode_operations proc_ns_link_inode_operations = { diff --git a/fs/proc/self.c b/fs/proc/self.c index ffeb202ec942..4348bb8907c2 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -16,7 +16,7 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, if (!tgid) return -ENOENT; sprintf(tmp, "%d", tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); + return readlink_copy(buffer, buflen, tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bcfe61202115..0b18776b075e 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -271,32 +271,6 @@ xfs_open_by_handle( return error; } -/* - * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's - * unused first argument. 
- */ -STATIC int -do_readlink( - char __user *buffer, - int buflen, - const char *link) -{ - int len; - - len = PTR_ERR(link); - if (IS_ERR(link)) - goto out; - - len = strlen(link); - if (len > (unsigned) buflen) - len = buflen; - if (copy_to_user(buffer, link, len)) - len = -EFAULT; - out: - return len; -} - - int xfs_readlink_by_handle( struct file *parfilp, @@ -334,7 +308,7 @@ xfs_readlink_by_handle( error = -xfs_readlink(XFS_I(dentry->d_inode), link); if (error) goto out_kfree; - error = do_readlink(hreq->ohandle, olen, link); + error = readlink_copy(hreq->ohandle, olen, link); if (error) goto out_kfree; diff --git a/include/linux/fs.h b/include/linux/fs.h index d9d88a0b456e..db181b542db1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2517,7 +2517,7 @@ extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) -extern int vfs_readlink(struct dentry *, char __user *, int, const char *); +extern int readlink_copy(char __user *, int, const char *); extern int page_readlink(struct dentry *, char __user *, int); extern void *page_follow_link_light(struct dentry *, struct nameidata *); extern void page_put_link(struct dentry *, struct nameidata *, void *); From 05faf3169f039cb03ebabdfee6eda0e7ada5ea11 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 1 Feb 2014 04:41:36 -0500 Subject: [PATCH 17/63] ntfs: don't put NULL into ->i_op/->i_fop Signed-off-by: Al Viro --- fs/ntfs/inode.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index ffb9b3675736..4de660fe739c 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1704,8 +1704,6 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) iput(bvi); skip_large_index_stuff: /* Setup the operations for this index inode. 
*/ - vi->i_op = NULL; - vi->i_fop = NULL; vi->i_mapping->a_ops = &ntfs_mst_aops; vi->i_blocks = ni->allocated_size >> 9; /* From 627bf81ac625f05060db033a0f3791521ad7bd79 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 1 Feb 2014 04:43:32 -0500 Subject: [PATCH 18/63] get rid of pointless checks for NULL ->i_op Signed-off-by: Al Viro --- fs/cachefiles/bind.c | 1 - fs/cachefiles/namei.c | 3 +-- security/integrity/evm/evm_crypto.c | 2 +- security/integrity/evm/evm_main.c | 2 +- security/tomoyo/realpath.c | 4 ++-- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 622f4696e484..5b99bafc31d1 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -124,7 +124,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) /* check parameters */ ret = -EOPNOTSUPP; if (!root->d_inode || - !root->d_inode->i_op || !root->d_inode->i_op->lookup || !root->d_inode->i_op->mkdir || !root->d_inode->i_op->setxattr || diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ca65f39dc8dc..1b1283bff8de 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -779,8 +779,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } ret = -EPERM; - if (!subdir->d_inode->i_op || - !subdir->d_inode->i_op->setxattr || + if (!subdir->d_inode->i_op->setxattr || !subdir->d_inode->i_op->getxattr || !subdir->d_inode->i_op->lookup || !subdir->d_inode->i_op->mkdir || diff --git a/security/integrity/evm/evm_crypto.c b/security/integrity/evm/evm_crypto.c index 3bab89eb21d6..e90ab0e20db8 100644 --- a/security/integrity/evm/evm_crypto.c +++ b/security/integrity/evm/evm_crypto.c @@ -137,7 +137,7 @@ static int evm_calc_hmac_or_hash(struct dentry *dentry, int error; int size; - if (!inode->i_op || !inode->i_op->getxattr) + if (!inode->i_op->getxattr) return -EOPNOTSUPP; desc = init_desc(type); if (IS_ERR(desc)) diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index 336b3ddfe63f..bab1c39ffcaf 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -62,7 +62,7 @@ static int evm_find_protected_xattrs(struct dentry *dentry) int error; int count = 0; - if (!inode->i_op || !inode->i_op->getxattr) + if (!inode->i_op->getxattr) return -EOPNOTSUPP; for (xattr = evm_config_xattrnames; *xattr != NULL; xattr++) { diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index 80a09c37cac8..a3386d119425 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -173,7 +173,7 @@ static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer, * Use filesystem name if filesystem does not support rename() * operation. */ - if (inode->i_op && !inode->i_op->rename) + if (!inode->i_op->rename) goto prepend_filesystem_name; } /* Prepend device name. */ @@ -282,7 +282,7 @@ char *tomoyo_realpath_from_path(struct path *path) * Get local name for filesystems without rename() operation * or dentry without vfsmount. */ - if (!path->mnt || (inode->i_op && !inode->i_op->rename)) + if (!path->mnt || !inode->i_op->rename) pos = tomoyo_get_local_path(path->dentry, buf, buf_len - 1); /* Get absolute name for the rest. 
*/ From 81c5a68478be38816bb5110ae0a5de1320cd2dfd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 1 Feb 2014 08:26:29 -0500 Subject: [PATCH 19/63] cifs: ->rename() without ->lookup() makes no sense Signed-off-by: Al Viro --- fs/cifs/cifsfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 849f6132b327..f31f9d6913b2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -849,7 +849,6 @@ const struct inode_operations cifs_file_inode_ops = { /* revalidate:cifs_revalidate, */ .setattr = cifs_setattr, .getattr = cifs_getattr, /* do we need this anymore? */ - .rename = cifs_rename, .permission = cifs_permission, #ifdef CONFIG_CIFS_XATTR .setxattr = cifs_setxattr, From 3ef120a459260b35175a64a418bdb115d80bf58f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Feb 2014 06:31:19 -0500 Subject: [PATCH 20/63] mn10300: kmap_atomic() returns void *, not unsigned long... Signed-off-by: Al Viro --- arch/mn10300/include/asm/highmem.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mn10300/include/asm/highmem.h b/arch/mn10300/include/asm/highmem.h index 7c137cd8aa37..2fbbe4d920aa 100644 --- a/arch/mn10300/include/asm/highmem.h +++ b/arch/mn10300/include/asm/highmem.h @@ -70,7 +70,7 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. */ -static inline unsigned long kmap_atomic(struct page *page) +static inline void *kmap_atomic(struct page *page) { unsigned long vaddr; int idx, type; @@ -89,7 +89,7 @@ static inline unsigned long kmap_atomic(struct page *page) set_pte(kmap_pte - idx, mk_pte(page, kmap_prot)); local_flush_tlb_one(vaddr); - return vaddr; + return (void *)vaddr; } static inline void __kunmap_atomic(unsigned long vaddr) From 8ffcb32e05239f0e53abfb0a1bc4eee4855b7fd2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 24 Jan 2014 12:17:54 +0000 Subject: [PATCH 21/63] VFS: Make delayed_free() call free_vfsmnt() Make delayed_free() call free_vfsmnt() so that we don't have two functions doing the same job. This requires the calls to mnt_free_id() in free_vfsmnt() to be moved into the callers of that function. 
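In the resulting layout this is the usual RCU deferred-free shape: free_vfsmnt() releases the per-mount allocations, and the call_rcu() callback only recovers the struct mount with container_of() and reuses that helper; mnt_free_id() stays on the synchronous side in every path, which is why it moves out to the callers. A simplified sketch of the end state (same names as in the diff below, CONFIG_SMP guard around the percpu free omitted):

	static void free_vfsmnt(struct mount *mnt)
	{
		kfree(mnt->mnt_devname);
		free_percpu(mnt->mnt_pcp);	/* under CONFIG_SMP */
		kmem_cache_free(mnt_cache, mnt);
	}

	static void delayed_free_vfsmnt(struct rcu_head *head)
	{
		/* mnt_rcu is embedded in struct mount; map back and free */
		free_vfsmnt(container_of(head, struct mount, mnt_rcu));
	}

	/* callers do mnt_free_id(mnt) themselves, then either call
	 * free_vfsmnt(mnt) directly (error paths) or defer with
	 * call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt). */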
Signed-off-by: David Howells Signed-off-by: Al Viro --- fs/namespace.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 20e8696c31a7..182bc41cd887 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -568,13 +568,17 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { kfree(mnt->mnt_devname); - mnt_free_id(mnt); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } +static void delayed_free_vfsmnt(struct rcu_head *head) +{ + free_vfsmnt(container_of(head, struct mount, mnt_rcu)); +} + /* call under rcu_read_lock */ bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { @@ -846,6 +850,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void root = mount_fs(type, flags, name, data); if (IS_ERR(root)) { + mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_CAST(root); } @@ -926,20 +931,11 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return mnt; out_free: + mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); } -static void delayed_free(struct rcu_head *head) -{ - struct mount *mnt = container_of(head, struct mount, mnt_rcu); - kfree(mnt->mnt_devname); -#ifdef CONFIG_SMP - free_percpu(mnt->mnt_pcp); -#endif - kmem_cache_free(mnt_cache, mnt); -} - static void mntput_no_expire(struct mount *mnt) { put_again: @@ -989,7 +985,7 @@ put_again: dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); - call_rcu(&mnt->mnt_rcu, delayed_free); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } void mntput(struct vfsmount *mnt) From 58bda1da4b3c3ec6da8d0badaba12ce02839a241 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Feb 2014 21:03:40 -0500 Subject: [PATCH 22/63] fuse/dev: use atomic maps Signed-off-by: Al Viro --- fs/fuse/dev.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 0a648bb455ae..f90af6fe6884 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -669,13 +669,13 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) if (!cs->write) { buf->ops->unmap(cs->pipe, buf, cs->mapaddr); } else { - kunmap(buf->page); + kunmap_atomic(cs->mapaddr); buf->len = PAGE_SIZE - cs->len; } cs->currbuf = NULL; cs->mapaddr = NULL; } else if (cs->mapaddr) { - kunmap(cs->pg); + kunmap_atomic(cs->mapaddr); if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); @@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); + cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); cs->len = buf->len; cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; @@ -726,7 +726,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap(page); + cs->mapaddr = kmap_atomic(page); cs->buf = cs->mapaddr; cs->len = PAGE_SIZE; cs->pipebufs++; @@ -745,7 +745,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) return err; BUG_ON(err != 1); offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap(cs->pg); + cs->mapaddr = kmap_atomic(cs->pg); cs->buf = cs->mapaddr + offset; cs->len = min(PAGE_SIZE - offset, cs->seglen); cs->seglen -= cs->len; From fbb32750a62df75d1ffea547f3908b21c5496d9f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Feb 2014 21:09:54 -0500 Subject: [PATCH 23/63] pipe: kill ->map() and ->unmap() all pipe_buffer_operations have the same instances of 
those... Signed-off-by: Al Viro --- drivers/char/virtio_console.c | 4 +- fs/fuse/dev.c | 6 +-- fs/pipe.c | 70 ++++++++--------------------------- fs/splice.c | 24 ++++-------- include/linux/pipe_fs_i.h | 19 ---------- kernel/relay.c | 2 - kernel/trace/trace.c | 4 -- 7 files changed, 29 insertions(+), 100 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 6928d094451d..60aafb8a1f2e 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -901,9 +901,9 @@ static int pipe_to_sg(struct pipe_inode_info *pipe, struct pipe_buffer *buf, if (len + offset > PAGE_SIZE) len = PAGE_SIZE - offset; - src = buf->ops->map(pipe, buf, 1); + src = kmap_atomic(buf->page); memcpy(page_address(page) + offset, src + buf->offset, len); - buf->ops->unmap(pipe, buf, src); + kunmap_atomic(src); sg_set_page(&(sgl->sg[sgl->n]), page, len, offset); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f90af6fe6884..aac71ce373e4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -667,7 +667,7 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) struct pipe_buffer *buf = cs->currbuf; if (!cs->write) { - buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + kunmap_atomic(cs->mapaddr); } else { kunmap_atomic(cs->mapaddr); buf->len = PAGE_SIZE - cs->len; @@ -706,7 +706,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->mapaddr = kmap_atomic(buf->page); cs->len = buf->len; cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; @@ -874,7 +874,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) out_fallback_unlock: unlock_page(newpage); out_fallback: - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->mapaddr = kmap_atomic(buf->page); cs->buf = cs->mapaddr + buf->offset; err = lock_request(cs->fc, cs->req); diff --git a/fs/pipe.c b/fs/pipe.c index 78fd0d0788db..6679c95eb4c3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -225,52 +225,6 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, page_cache_release(page); } -/** - * generic_pipe_buf_map - virtually map a pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be mapped - * @atomic: whether to use an atomic map - * - * Description: - * This function returns a kernel virtual address mapping for the - * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided - * and the caller has to be careful not to fault before calling - * the unmap function. - * - * Note that this function calls kmap_atomic() if @atomic != 0. - */ -void *generic_pipe_buf_map(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, int atomic) -{ - if (atomic) { - buf->flags |= PIPE_BUF_FLAG_ATOMIC; - return kmap_atomic(buf->page); - } - - return kmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_map); - -/** - * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer that should be unmapped - * @map_data: the data that the mapping function returned - * - * Description: - * This function undoes the mapping that ->map() provided. 
- */ -void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, - struct pipe_buffer *buf, void *map_data) -{ - if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { - buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; - kunmap_atomic(map_data); - } else - kunmap(buf->page); -} -EXPORT_SYMBOL(generic_pipe_buf_unmap); - /** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to @@ -351,8 +305,6 @@ EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -361,8 +313,6 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -410,9 +360,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, atomic = !iov_fault_in_pages_write(iov, chars); redo: - addr = ops->map(pipe, buf, atomic); + if (atomic) + addr = kmap_atomic(buf->page); + else + addr = kmap(buf->page); error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); - ops->unmap(pipe, buf, addr); + if (atomic) + kunmap_atomic(addr); + else + kunmap(buf->page); if (unlikely(error)) { /* * Just retry with the slow path if we failed. @@ -538,10 +494,16 @@ pipe_write(struct kiocb *iocb, const struct iovec *_iov, iov_fault_in_pages_read(iov, chars); redo1: - addr = ops->map(pipe, buf, atomic); + if (atomic) + addr = kmap_atomic(buf->page); + else + addr = kmap(buf->page); error = pipe_iov_copy_from_user(offset + addr, iov, chars, atomic); - ops->unmap(pipe, buf, addr); + if (atomic) + kunmap_atomic(addr); + else + kunmap(buf->page); ret = error; do_wakeup = 1; if (error) { diff --git a/fs/splice.c b/fs/splice.c index 12028fa41def..ca3bfbd970f3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -136,8 +136,6 @@ error: const struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = page_cache_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, @@ -156,8 +154,6 @@ static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, static const struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = page_cache_pipe_buf_release, .steal = user_page_pipe_buf_steal, @@ -547,8 +543,6 @@ EXPORT_SYMBOL(generic_file_splice_read); static const struct pipe_buf_operations default_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -564,8 +558,6 @@ static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, /* Pipe buffer operations for a socket and similar. 
*/ const struct pipe_buf_operations nosteal_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_nosteal, @@ -767,13 +759,13 @@ int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, goto out; if (buf->page != page) { - char *src = buf->ops->map(pipe, buf, 1); + char *src = kmap_atomic(buf->page); char *dst = kmap_atomic(page); memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); kunmap_atomic(dst); - buf->ops->unmap(pipe, buf, src); + kunmap_atomic(src); } ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len, page, fsdata); @@ -1067,9 +1059,9 @@ static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, void *data; loff_t tmp = sd->pos; - data = buf->ops->map(pipe, buf, 0); + data = kmap(buf->page); ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); - buf->ops->unmap(pipe, buf, data); + kunmap(buf->page); return ret; } @@ -1536,10 +1528,10 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, * pages and doing an atomic copy */ if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { - src = buf->ops->map(pipe, buf, 1); + src = kmap_atomic(buf->page); ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, sd->len); - buf->ops->unmap(pipe, buf, src); + kunmap_atomic(src); if (!ret) { ret = sd->len; goto out; @@ -1549,13 +1541,13 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, /* * No dice, use slow non-atomic map and copy */ - src = buf->ops->map(pipe, buf, 0); + src = kmap(buf->page); ret = sd->len; if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) ret = -EFAULT; - buf->ops->unmap(pipe, buf, src); + kunmap(buf->page); out: if (ret > 0) sd->u.userptr += ret; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index ab5752692113..6dffcebe6105 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -82,23 +82,6 @@ struct pipe_buf_operations { */ int can_merge; - /* - * ->map() returns a virtual address mapping of the pipe buffer. - * The last integer flag reflects whether this should be an atomic - * mapping or not. The atomic map is faster, however you can't take - * page faults before calling ->unmap() again. So if you need to eg - * access user data through copy_to/from_user(), then you must get - * a non-atomic map. ->map() uses the kmap_atomic slot for - * atomic maps, you have to be careful if mapping another page as - * source or destination for a copy. - */ - void * (*map)(struct pipe_inode_info *, struct pipe_buffer *, int); - - /* - * Undoes ->map(), finishes the virtual mapping of the pipe buffer. - */ - void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *, void *); - /* * ->confirm() verifies that the data in the pipe buffer is there * and that the contents are good. 
If the pages in the pipe belong @@ -150,8 +133,6 @@ struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ -void *generic_pipe_buf_map(struct pipe_inode_info *, struct pipe_buffer *, int); -void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void *); void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); diff --git a/kernel/relay.c b/kernel/relay.c index 5001c9887db1..98833f664fb6 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1195,8 +1195,6 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe, static const struct pipe_buf_operations relay_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = relay_pipe_buf_release, .steal = generic_pipe_buf_steal, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 24c1f2382557..7511de35257f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4316,8 +4316,6 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, static const struct pipe_buf_operations tracing_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = generic_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -5194,8 +5192,6 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, /* Pipe buffer operations for a buffer. */ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, .steal = generic_pipe_buf_steal, From c186afb4dbd0050a537b96c7fbee2dba3b57fc38 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Feb 2014 21:16:54 -0500 Subject: [PATCH 24/63] switch ->is_partially_uptodate() to saner arguments Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/vfs.txt | 2 +- fs/buffer.c | 6 +++--- include/linux/buffer_head.h | 4 ++-- include/linux/fs.h | 2 +- mm/filemap.c | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 5b0c083d7c0e..bb2534bc0b03 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -198,7 +198,7 @@ prototypes: unsigned long *); int (*migratepage)(struct address_space *, struct page *, struct page *); int (*launder_page)(struct page *); - int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long); + int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); int (*error_remove_page)(struct address_space *, struct page *); int (*swap_activate)(struct file *); int (*swap_deactivate)(struct file *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index c53784c119c8..419e7348c481 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -580,7 +580,7 @@ struct address_space_operations { /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); int (*launder_page) (struct page *); - int (*is_partially_uptodate) (struct page *, read_descriptor_t *, + int (*is_partially_uptodate) 
(struct page *, unsigned long, unsigned long); void (*is_dirty_writeback) (struct page *, bool *, bool *); int (*error_remove_page) (struct mapping *mapping, struct page *page); diff --git a/fs/buffer.c b/fs/buffer.c index 27265a8b43c1..027ae3bdfbbd 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2114,8 +2114,8 @@ EXPORT_SYMBOL(generic_write_end); * Returns true if all buffers which correspond to a file portion * we want to read are uptodate. */ -int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, - unsigned long from) +int block_is_partially_uptodate(struct page *page, unsigned long from, + unsigned long count) { unsigned block_start, block_end, blocksize; unsigned to; @@ -2127,7 +2127,7 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, head = page_buffers(page); blocksize = head->b_size; - to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count); + to = min_t(unsigned, PAGE_CACHE_SIZE - from, count); to = from + to; if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize) return 0; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index d77797a52b7b..c40302f909ce 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -210,8 +210,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block, int block_write_full_page_endio(struct page *page, get_block_t *get_block, struct writeback_control *wbc, bh_end_io_t *handler); int block_read_full_page(struct page*, get_block_t*); -int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc, - unsigned long from); +int block_is_partially_uptodate(struct page *page, unsigned long from, + unsigned long count); int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, get_block_t *get_block); int __block_write_begin(struct page *page, loff_t pos, unsigned len, diff --git a/include/linux/fs.h b/include/linux/fs.h index db181b542db1..ddfff2ecef0b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -385,7 +385,7 @@ struct address_space_operations { int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); int (*launder_page) (struct page *); - int (*is_partially_uptodate) (struct page *, read_descriptor_t *, + int (*is_partially_uptodate) (struct page *, unsigned long, unsigned long); void (*is_dirty_writeback) (struct page *, bool *, bool *); int (*error_remove_page)(struct address_space *, struct page *); diff --git a/mm/filemap.c b/mm/filemap.c index 7a13f6ac5421..46e98019af6c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1148,7 +1148,7 @@ find_page: if (!page->mapping) goto page_not_up_to_date_locked; if (!mapping->a_ops->is_partially_uptodate(page, - desc, offset)) + offset, desc->count)) goto page_not_up_to_date_locked; unlock_page(page); } From 9e8c2af96e0d2d5fe298dd796fb6bc16e888a48d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Feb 2014 22:10:25 -0500 Subject: [PATCH 25/63] callers of iov_copy_from_user_atomic() don't need pagecache_disable() ... 
it does that itself (via kmap_atomic()) Signed-off-by: Al Viro --- fs/btrfs/file.c | 5 ----- fs/fuse/file.c | 2 -- mm/filemap.c | 3 --- 3 files changed, 10 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0165b8672f09..34e096201da1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -425,13 +425,8 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, struct page *page = prepared_pages[pg]; /* * Copy data from userspace to the current page - * - * Disable pagefault to avoid recursive lock since - * the pages are already locked */ - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, count); - pagefault_enable(); /* Flush processor's dcache for this page */ flush_dcache_page(page); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 77bcc303c3ae..a91d3b4d32f3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1003,9 +1003,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); - pagefault_enable(); flush_dcache_page(page); mark_page_accessed(page); diff --git a/mm/filemap.c b/mm/filemap.c index 46e98019af6c..bfb7a97d6d0f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1974,7 +1974,6 @@ size_t iov_iter_copy_from_user_atomic(struct page *page, char *kaddr; size_t copied; - BUG_ON(!in_atomic()); kaddr = kmap_atomic(page); if (likely(i->nr_segs == 1)) { int left; @@ -2348,9 +2347,7 @@ again: if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); - pagefault_enable(); flush_dcache_page(page); mark_page_accessed(page); From 8142c184b8f16d213eb8ba06ccb6222259a51cfc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 2 Feb 2014 22:18:22 -0500 Subject: [PATCH 26/63] do_shmem_file_read(): call file_read_actor() directly Signed-off-by: Al Viro --- mm/shmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 1f18c9d0d93e..9398e6cd48cb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1462,7 +1462,7 @@ shmem_write_end(struct file *file, struct address_space *mapping, return copied; } -static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc, read_actor_t actor) +static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc) { struct inode *inode = file_inode(filp); struct address_space *mapping = inode->i_mapping; @@ -1550,7 +1550,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ * "pos" here (the actor routine has to update the user buffer * pointers and the remaining count). 
*/ - ret = actor(desc, page, offset, nr); + ret = file_read_actor(desc, page, offset, nr); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; @@ -1588,7 +1588,7 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, if (desc.count == 0) continue; desc.error = 0; - do_shmem_file_read(filp, ppos, &desc, file_read_actor); + do_shmem_file_read(filp, ppos, &desc); retval += desc.written; if (desc.error) { retval = retval ?: desc.error; From 9223687863ffa63fa655f52ef64148ee08dee4d1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Nov 2013 16:29:46 -0800 Subject: [PATCH 27/63] iov_iter: Move iov_iter to uio.h Signed-off-by: Kent Overstreet --- include/linux/fs.h | 32 ----------------------------- include/linux/uio.h | 50 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 32 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index ddfff2ecef0b..2261ac8f0534 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -295,38 +295,6 @@ struct page; struct address_space; struct writeback_control; -struct iov_iter { - const struct iovec *iov; - unsigned long nr_segs; - size_t iov_offset; - size_t count; -}; - -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes); -size_t iov_iter_copy_from_user(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes); -void iov_iter_advance(struct iov_iter *i, size_t bytes); -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); -size_t iov_iter_single_seg_count(const struct iov_iter *i); - -static inline void iov_iter_init(struct iov_iter *i, - const struct iovec *iov, unsigned long nr_segs, - size_t count, size_t written) -{ - i->iov = iov; - i->nr_segs = nr_segs; - i->iov_offset = 0; - i->count = count + written; - - iov_iter_advance(i, written); -} - -static inline size_t iov_iter_count(struct iov_iter *i) -{ - return i->count; -} - /* * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet diff --git a/include/linux/uio.h b/include/linux/uio.h index c55ce243cc09..347d70ce098e 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -9,14 +9,23 @@ #ifndef __LINUX_UIO_H #define __LINUX_UIO_H +#include #include +struct page; struct kvec { void *iov_base; /* and that should *never* hold a userland pointer */ size_t iov_len; }; +struct iov_iter { + const struct iovec *iov; + unsigned long nr_segs; + size_t iov_offset; + size_t count; +}; + /* * Total number of bytes covered by an iovec. 
* @@ -34,8 +43,49 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) return ret; } +static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) +{ + return (struct iovec) { + .iov_base = iter->iov->iov_base + iter->iov_offset, + .iov_len = min(iter->count, + iter->iov->iov_len - iter->iov_offset), + }; +} + +#define iov_for_each(iov, iter, start) \ + for (iter = (start); \ + (iter).count && \ + ((iov = iov_iter_iovec(&(iter))), 1); \ + iov_iter_advance(&(iter), (iov).iov_len)) + unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to); +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes); +size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes); +void iov_iter_advance(struct iov_iter *i, size_t bytes); +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); +size_t iov_iter_single_seg_count(const struct iov_iter *i); + +static inline void iov_iter_init(struct iov_iter *i, + const struct iovec *iov, unsigned long nr_segs, + size_t count, size_t written) +{ + i->iov = iov; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count + written; + + iov_iter_advance(i, written); +} + +static inline size_t iov_iter_count(struct iov_iter *i) +{ + return i->count; +} + int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len); int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len); + #endif From 6e58e79db8a16222b31fc8da1ca2ac2dccfc4237 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Feb 2014 17:07:03 -0500 Subject: [PATCH 28/63] introduce copy_page_to_iter, kill loop over iovec in generic_file_aio_read() generic_file_aio_read() was looping over the target iovec, with loop over (source) pages nested inside that. Just set an iov_iter up and pass *that* to do_generic_file_aio_read(). With copy_page_to_iter() doing all work of mapping and copying a page to iovec and advancing iov_iter. Switch shmem_file_aio_read() to the same and kill file_read_actor(), while we are at it. 
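The consumer-side pattern is worth spelling out, since the later conversions in this series repeat it: build the iov_iter once from the user iovec, then hand each uptodate page to copy_page_to_iter(), which does the kmap, the (possibly segment-crossing) copy and the iterator advance, and returns how much it managed to copy; a short return means the user buffer faulted. This is a hedged sketch only, with placeholder names (page, offset, nr stand for whatever the read loop computed), not the exact mainline loop:

	struct iov_iter i;
	ssize_t written = 0;

	iov_iter_init(&i, iov, nr_segs, count, 0);

	while (iov_iter_count(&i)) {
		/* page/offset/nr come from the usual pagecache lookup */
		size_t copied = copy_page_to_iter(page, offset, nr, &i);

		written += copied;
		if (copied < nr)	/* fault on the user side */
			return written ? written : -EFAULT;
		/* advance offset/index into the next page ... */
	}
	return written;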
Signed-off-by: Al Viro --- include/linux/fs.h | 1 - include/linux/uio.h | 2 + mm/filemap.c | 202 ++++++++++++++++++++++++-------------------- mm/shmem.c | 77 ++++++----------- 4 files changed, 138 insertions(+), 144 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 2261ac8f0534..2a5b1744f80a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2390,7 +2390,6 @@ extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, unsigned long size, pgoff_t pgoff); -extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, diff --git a/include/linux/uio.h b/include/linux/uio.h index 347d70ce098e..199bcc34241b 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -67,6 +67,8 @@ size_t iov_iter_copy_from_user(struct page *page, void iov_iter_advance(struct iov_iter *i, size_t bytes); int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); +size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i); static inline void iov_iter_init(struct iov_iter *i, const struct iovec *iov, unsigned long nr_segs, diff --git a/mm/filemap.c b/mm/filemap.c index bfb7a97d6d0f..a16eb2c4f316 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1085,11 +1085,90 @@ static void shrink_readahead_size_eio(struct file *filp, ra->ra_pages /= 4; } +size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_writeable(buf, copy)) { + kaddr = kmap_atomic(page); + from = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = from - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + from = kaddr + offset; + left = __copy_to_user(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + kunmap(page); +done: + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} 
+EXPORT_SYMBOL(copy_page_to_iter); + /** * do_generic_file_read - generic file read routine * @filp: the file to read * @ppos: current file position - * @desc: read_descriptor + * @iter: data destination + * @written: already copied * * This is a generic file read routine, and uses the * mapping->a_ops->readpage() function for the actual low-level stuff. @@ -1097,8 +1176,8 @@ static void shrink_readahead_size_eio(struct file *filp, * This is really ugly. But the goto's actually try to clarify some * of the logic when it comes to error handling etc. */ -static void do_generic_file_read(struct file *filp, loff_t *ppos, - read_descriptor_t *desc) +static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos, + struct iov_iter *iter, ssize_t written) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; @@ -1108,12 +1187,12 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos, pgoff_t prev_index; unsigned long offset; /* offset into pagecache page */ unsigned int prev_offset; - int error; + int error = 0; index = *ppos >> PAGE_CACHE_SHIFT; prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); - last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; for (;;) { @@ -1148,7 +1227,7 @@ find_page: if (!page->mapping) goto page_not_up_to_date_locked; if (!mapping->a_ops->is_partially_uptodate(page, - offset, desc->count)) + offset, iter->count)) goto page_not_up_to_date_locked; unlock_page(page); } @@ -1198,24 +1277,23 @@ page_ok: /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... - * - * The file_read_actor routine returns how many bytes were - * actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). */ - ret = file_read_actor(desc, page, offset, nr); + + ret = copy_page_to_iter(page, offset, nr, iter); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; prev_offset = offset; page_cache_release(page); - if (ret == nr && desc->count) - continue; - goto out; + written += ret; + if (!iov_iter_count(iter)) + goto out; + if (ret < nr) { + error = -EFAULT; + goto out; + } + continue; page_not_up_to_date: /* Get exclusive access to the page ... */ @@ -1250,6 +1328,7 @@ readpage: if (unlikely(error)) { if (error == AOP_TRUNCATED_PAGE) { page_cache_release(page); + error = 0; goto find_page; } goto readpage_error; @@ -1280,7 +1359,6 @@ readpage: readpage_error: /* UHHUH! A synchronous read error occurred. 
Report it */ - desc->error = error; page_cache_release(page); goto out; @@ -1291,16 +1369,17 @@ no_cached_page: */ page = page_cache_alloc_cold(mapping); if (!page) { - desc->error = -ENOMEM; + error = -ENOMEM; goto out; } error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (error) { page_cache_release(page); - if (error == -EEXIST) + if (error == -EEXIST) { + error = 0; goto find_page; - desc->error = error; + } goto out; } goto readpage; @@ -1313,44 +1392,7 @@ out: *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; file_accessed(filp); -} - -int file_read_actor(read_descriptor_t *desc, struct page *page, - unsigned long offset, unsigned long size) -{ - char *kaddr; - unsigned long left, count = desc->count; - - if (size > count) - size = count; - - /* - * Faults on the destination of a read are common, so do it before - * taking the kmap. - */ - if (!fault_in_pages_writeable(desc->arg.buf, size)) { - kaddr = kmap_atomic(page); - left = __copy_to_user_inatomic(desc->arg.buf, - kaddr + offset, size); - kunmap_atomic(kaddr); - if (left == 0) - goto success; - } - - /* Do it the slow way */ - kaddr = kmap(page); - left = __copy_to_user(desc->arg.buf, kaddr + offset, size); - kunmap(page); - - if (left) { - size -= left; - desc->error = -EFAULT; - } -success: - desc->count = count - size; - desc->written += size; - desc->arg.buf += size; - return size; + return written ? written : error; } /* @@ -1408,14 +1450,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; ssize_t retval; - unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; + struct iov_iter i; count = 0; retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (retval) return retval; + iov_iter_init(&i, iov, nr_segs, count, 0); /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (filp->f_flags & O_DIRECT) { @@ -1437,6 +1480,11 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (retval > 0) { *ppos = pos + retval; count -= retval; + /* + * If we did a short DIO read we need to skip the + * section of the iov that we've already read data into. + */ + iov_iter_advance(&i, retval); } /* @@ -1453,39 +1501,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, } } - count = retval; - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - loff_t offset = 0; - - /* - * If we did a short DIO read we need to skip the section of the - * iov that we've already read data into. 
- */ - if (count) { - if (count > iov[seg].iov_len) { - count -= iov[seg].iov_len; - continue; - } - offset = count; - count = 0; - } - - desc.written = 0; - desc.arg.buf = iov[seg].iov_base + offset; - desc.count = iov[seg].iov_len - offset; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp, ppos, &desc); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; - } + retval = do_generic_file_read(filp, ppos, &i, retval); out: return retval; } diff --git a/mm/shmem.c b/mm/shmem.c index 9398e6cd48cb..17d3799d04bd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1462,13 +1462,25 @@ shmem_write_end(struct file *file, struct address_space *mapping, return copied; } -static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc) +static ssize_t shmem_file_aio_read(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct inode *inode = file_inode(filp); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; enum sgp_type sgp = SGP_READ; + int error; + ssize_t retval; + size_t count; + loff_t *ppos = &iocb->ki_pos; + struct iov_iter iter; + + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; + iov_iter_init(&iter, iov, nr_segs, count, 0); /* * Might this read be for a stacking filesystem? Then when reading @@ -1496,10 +1508,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ break; } - desc->error = shmem_getpage(inode, index, &page, sgp, NULL); - if (desc->error) { - if (desc->error == -EINVAL) - desc->error = 0; + error = shmem_getpage(inode, index, &page, sgp, NULL); + if (error) { + if (error == -EINVAL) + error = 0; break; } if (page) @@ -1543,61 +1555,26 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... - * - * The actor routine returns how many bytes were actually used.. - * NOTE! This may not be the same as how much of a user buffer - * we filled up (we may be padding etc), so we can only update - * "pos" here (the actor routine has to update the user buffer - * pointers and the remaining count). 
*/ - ret = file_read_actor(desc, page, offset, nr); + ret = copy_page_to_iter(page, offset, nr, &iter); + retval += ret; offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; page_cache_release(page); - if (ret != nr || !desc->count) + if (!iov_iter_count(&iter)) break; - + if (ret < nr) { + error = -EFAULT; + break; + } cond_resched(); } *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; - file_accessed(filp); -} - -static ssize_t shmem_file_aio_read(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *filp = iocb->ki_filp; - ssize_t retval; - unsigned long seg; - size_t count; - loff_t *ppos = &iocb->ki_pos; - - retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); - if (retval) - return retval; - - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; - - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_shmem_file_read(filp, ppos, &desc); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; - } - return retval; + file_accessed(file); + return retval ? retval : error; } static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, From 74027f4a181754e917853bd1d2e21449f008ab39 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 4 Feb 2014 13:47:26 -0500 Subject: [PATCH 29/63] cifs_iovec_read(): resubmit shouldn't restart the loop ... by that point the request we'd just resent is in the head of the list anyway. Just return to the beginning of the loop body... Signed-off-by: Al Viro --- fs/cifs/file.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 834fce759d80..df414db74ab9 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2918,8 +2918,8 @@ error: rc = 0; /* the loop below should proceed in the order of increasing offsets */ -restart_loop: list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { + again: if (!rc) { ssize_t copied; @@ -2927,20 +2927,20 @@ restart_loop: rc = wait_for_completion_killable(&rdata->done); if (rc) rc = -EINTR; - else if (rdata->result) + else if (rdata->result) { rc = rdata->result; - else { + /* resend call if it's a retryable error */ + if (rc == -EAGAIN) { + rc = cifs_retry_async_readv(rdata); + goto again; + } + } else { rc = cifs_readdata_to_iov(rdata, iov, nr_segs, *poffset, &copied); total_read += copied; } - /* resend call if it's a retryable error */ - if (rc == -EAGAIN) { - rc = cifs_retry_async_readv(rdata); - goto restart_loop; - } } list_del_init(&rdata->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); From 637b58c2887e5e57850865839cc75f59184b23d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Feb 2014 19:11:42 -0500 Subject: [PATCH 30/63] switch pipe_read() to copy_page_to_iter() Signed-off-by: Al Viro --- fs/pipe.c | 79 ++++++------------------------------------------------- 1 file changed, 8 insertions(+), 71 deletions(-) diff --git a/fs/pipe.c b/fs/pipe.c index 6679c95eb4c3..034bffac3f97 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -142,55 +142,6 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, return 0; } -static int -pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, - int atomic) -{ - unsigned long copy; - - while (len > 0) { - while (!iov->iov_len) - iov++; - copy = min_t(unsigned long, len, iov->iov_len); - - if (atomic) { - 
if (__copy_to_user_inatomic(iov->iov_base, from, copy)) - return -EFAULT; - } else { - if (copy_to_user(iov->iov_base, from, copy)) - return -EFAULT; - } - from += copy; - len -= copy; - iov->iov_base += copy; - iov->iov_len -= copy; - } - return 0; -} - -/* - * Attempt to pre-fault in the user memory, so we can use atomic copies. - * Returns the number of bytes not faulted in. - */ -static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) -{ - while (!iov->iov_len) - iov++; - - while (len > 0) { - unsigned long this_len; - - this_len = min_t(unsigned long, len, iov->iov_len); - if (fault_in_pages_writeable(iov->iov_base, this_len)) - break; - - len -= this_len; - iov++; - } - - return len; -} - /* * Pre-fault in the user memory, so we can use atomic copies. */ @@ -329,12 +280,15 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, ssize_t ret; struct iovec *iov = (struct iovec *)_iov; size_t total_len; + struct iov_iter iter; total_len = iov_length(iov, nr_segs); /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; + iov_iter_init(&iter, iov, nr_segs, total_len, 0); + do_wakeup = 0; ret = 0; __pipe_lock(pipe); @@ -344,9 +298,9 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, int curbuf = pipe->curbuf; struct pipe_buffer *buf = pipe->bufs + curbuf; const struct pipe_buf_operations *ops = buf->ops; - void *addr; size_t chars = buf->len; - int error, atomic; + size_t written; + int error; if (chars > total_len) chars = total_len; @@ -358,27 +312,10 @@ pipe_read(struct kiocb *iocb, const struct iovec *_iov, break; } - atomic = !iov_fault_in_pages_write(iov, chars); -redo: - if (atomic) - addr = kmap_atomic(buf->page); - else - addr = kmap(buf->page); - error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); - if (atomic) - kunmap_atomic(addr); - else - kunmap(buf->page); - if (unlikely(error)) { - /* - * Just retry with the slow path if we failed. - */ - if (atomic) { - atomic = 0; - goto redo; - } + written = copy_page_to_iter(buf->page, buf->offset, chars, &iter); + if (unlikely(written < chars)) { if (!ret) - ret = error; + ret = -EFAULT; break; } ret += chars; From 6130f5315ee80a591285a25957af71621bd0f17e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Feb 2014 18:19:51 -0500 Subject: [PATCH 31/63] switch vmsplice_to_user() to copy_page_to_iter() I've switched the sanity checks on iovec to rw_copy_check_uvector(); we might need to do a local analog, if any behaviour differences are not actually bugfixes here... 
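With that, the per-buffer actor collapses to a single call: copy_page_to_iter() copies out of the pipe page into (and advances) the iov_iter carried in sd->u.data, and a short copy is reported as -EFAULT. Sketch of the new shape, condensed from the diff below:

	static int pipe_to_user(struct pipe_inode_info *pipe,
				struct pipe_buffer *buf, struct splice_desc *sd)
	{
		int n = copy_page_to_iter(buf->page, buf->offset, sd->len,
					  sd->u.data);
		return n == sd->len ? n : -EFAULT;
	}

	/* caller side: rw_copy_check_uvector() validates and imports the user
	 * iovec (on-stack iovstack[UIO_FASTIOV], kmalloc'ed copy for larger
	 * vectors), iov_iter_init() wraps it, sd.u.data = &iter, and a single
	 * __splice_from_pipe(pipe, &sd, pipe_to_user) pass does the rest;
	 * kfree(iov) afterwards if it was not the on-stack array. */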
Signed-off-by: Al Viro --- fs/splice.c | 114 +++++++++++----------------------------------------- 1 file changed, 23 insertions(+), 91 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index ca3bfbd970f3..9bc07d2b53cf 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1520,116 +1520,48 @@ static int get_iovec_page_array(const struct iovec __user *iov, static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - char *src; - int ret; - - /* - * See if we can use the atomic maps, by prefaulting in the - * pages and doing an atomic copy - */ - if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) { - src = kmap_atomic(buf->page); - ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset, - sd->len); - kunmap_atomic(src); - if (!ret) { - ret = sd->len; - goto out; - } - } - - /* - * No dice, use slow non-atomic map and copy - */ - src = kmap(buf->page); - - ret = sd->len; - if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len)) - ret = -EFAULT; - - kunmap(buf->page); -out: - if (ret > 0) - sd->u.userptr += ret; - return ret; + int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); + return n == sd->len ? n : -EFAULT; } /* * For lack of a better implementation, implement vmsplice() to userspace * as a simple copy of the pipes pages to the user iov. */ -static long vmsplice_to_user(struct file *file, const struct iovec __user *iov, +static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, unsigned long nr_segs, unsigned int flags) { struct pipe_inode_info *pipe; struct splice_desc sd; - ssize_t size; - int error; long ret; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t count = 0; pipe = get_pipe_info(file); if (!pipe) return -EBADF; + ret = rw_copy_check_uvector(READ, uiov, nr_segs, + ARRAY_SIZE(iovstack), iovstack, &iov); + if (ret <= 0) + return ret; + + iov_iter_init(&iter, iov, nr_segs, count, 0); + + sd.len = 0; + sd.total_len = count; + sd.flags = flags; + sd.u.data = &iter; + sd.pos = 0; + pipe_lock(pipe); - - error = ret = 0; - while (nr_segs) { - void __user *base; - size_t len; - - /* - * Get user address base and length for this iovec. - */ - error = get_user(base, &iov->iov_base); - if (unlikely(error)) - break; - error = get_user(len, &iov->iov_len); - if (unlikely(error)) - break; - - /* - * Sanity check this iovec. 0 read succeeds. - */ - if (unlikely(!len)) - break; - if (unlikely(!base)) { - error = -EFAULT; - break; - } - - if (unlikely(!access_ok(VERIFY_WRITE, base, len))) { - error = -EFAULT; - break; - } - - sd.len = 0; - sd.total_len = len; - sd.flags = flags; - sd.u.userptr = base; - sd.pos = 0; - - size = __splice_from_pipe(pipe, &sd, pipe_to_user); - if (size < 0) { - if (!ret) - ret = size; - - break; - } - - ret += size; - - if (size < len) - break; - - nr_segs--; - iov++; - } - + ret = __splice_from_pipe(pipe, &sd, pipe_to_user); pipe_unlock(pipe); - if (!ret) - ret = error; + if (iov != iovstack) + kfree(iov); return ret; } From 7f25bba819a38ab7310024a9350655f374707e20 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 4 Feb 2014 14:07:43 -0500 Subject: [PATCH 32/63] cifs_iovec_read: keep iov_iter between the calls of cifs_readdata_to_iov() ... we are doing them on adjacent parts of file, so what happens is that each subsequent call works to rebuild the iov_iter to exact state it had been abandoned in by previous one. Just keep it through the entire cifs_iovec_read(). 
And use copy_page_to_iter() instead of doing kmap/copy_to_user/kunmap manually... Signed-off-by: Al Viro --- fs/cifs/file.c | 62 ++++++++++++++------------------------------------ 1 file changed, 17 insertions(+), 45 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index df414db74ab9..ad63e4740aff 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2727,56 +2727,27 @@ cifs_retry_async_readv(struct cifs_readdata *rdata) /** * cifs_readdata_to_iov - copy data from pages in response to an iovec * @rdata: the readdata response with list of pages holding data - * @iov: vector in which we should copy the data - * @nr_segs: number of segments in vector - * @offset: offset into file of the first iovec - * @copied: used to return the amount of data copied to the iov + * @iter: destination for our data * * This function copies data from a list of pages in a readdata response into * an array of iovecs. It will first calculate where the data should go * based on the info in the readdata and then copy the data into that spot. */ -static ssize_t -cifs_readdata_to_iov(struct cifs_readdata *rdata, const struct iovec *iov, - unsigned long nr_segs, loff_t offset, ssize_t *copied) +static int +cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) { - int rc = 0; - struct iov_iter ii; - size_t pos = rdata->offset - offset; - ssize_t remaining = rdata->bytes; - unsigned char *pdata; + size_t remaining = rdata->bytes; unsigned int i; - /* set up iov_iter and advance to the correct offset */ - iov_iter_init(&ii, iov, nr_segs, iov_length(iov, nr_segs), 0); - iov_iter_advance(&ii, pos); - - *copied = 0; for (i = 0; i < rdata->nr_pages; i++) { - ssize_t copy; struct page *page = rdata->pages[i]; - - /* copy a whole page or whatever's left */ - copy = min_t(ssize_t, remaining, PAGE_SIZE); - - /* ...but limit it to whatever space is left in the iov */ - copy = min_t(ssize_t, copy, iov_iter_count(&ii)); - - /* go while there's data to be copied and no errors */ - if (copy && !rc) { - pdata = kmap(page); - rc = memcpy_toiovecend(ii.iov, pdata, ii.iov_offset, - (int)copy); - kunmap(page); - if (!rc) { - *copied += copy; - remaining -= copy; - iov_iter_advance(&ii, copy); - } - } + size_t copy = min(remaining, PAGE_SIZE); + size_t written = copy_page_to_iter(page, 0, copy, iter); + remaining -= written; + if (written < copy && iov_iter_count(iter) > 0) + break; } - - return rc; + return remaining ? -EFAULT : 0; } static void @@ -2851,6 +2822,7 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, struct cifsFileInfo *open_file; struct cifs_readdata *rdata, *tmp; struct list_head rdata_list; + struct iov_iter to; pid_t pid; if (!nr_segs) @@ -2860,6 +2832,8 @@ cifs_iovec_read(struct file *file, const struct iovec *iov, if (!len) return 0; + iov_iter_init(&to, iov, nr_segs, len, 0); + INIT_LIST_HEAD(&rdata_list); cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); open_file = file->private_data; @@ -2917,12 +2891,11 @@ error: if (!list_empty(&rdata_list)) rc = 0; + len = iov_iter_count(&to); /* the loop below should proceed in the order of increasing offsets */ list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { again: if (!rc) { - ssize_t copied; - /* FIXME: freezable sleep too? 
*/ rc = wait_for_completion_killable(&rdata->done); if (rc) @@ -2935,10 +2908,7 @@ error: goto again; } } else { - rc = cifs_readdata_to_iov(rdata, iov, - nr_segs, *poffset, - &copied); - total_read += copied; + rc = cifs_readdata_to_iov(rdata, &to); } } @@ -2946,6 +2916,8 @@ error: kref_put(&rdata->refcount, cifs_uncached_readdata_release); } + total_read = len - iov_iter_count(&to); + cifs_stats_bytes_read(tcon, total_read); *poffset += total_read; From 0165e8100be3b2b81f747ebc25418656404c61b8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 4 Feb 2014 14:19:48 -0500 Subject: [PATCH 33/63] fold cifs_iovec_read() into its (only) caller Signed-off-by: Al Viro --- fs/cifs/file.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ad63e4740aff..3443b8f8e1c0 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2808,14 +2808,14 @@ cifs_uncached_read_into_pages(struct TCP_Server_Info *server, return total_read > 0 ? total_read : result; } -static ssize_t -cifs_iovec_read(struct file *file, const struct iovec *iov, - unsigned long nr_segs, loff_t *poffset) +ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { + struct file *file = iocb->ki_filp; ssize_t rc; size_t len, cur_len; ssize_t total_read = 0; - loff_t offset = *poffset; + loff_t offset = pos; unsigned int npages; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; @@ -2919,25 +2919,16 @@ error: total_read = len - iov_iter_count(&to); cifs_stats_bytes_read(tcon, total_read); - *poffset += total_read; /* mask nodata case */ if (rc == -ENODATA) rc = 0; - return total_read ? total_read : rc; -} - -ssize_t cifs_user_readv(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - ssize_t read; - - read = cifs_iovec_read(iocb->ki_filp, iov, nr_segs, &pos); - if (read > 0) - iocb->ki_pos = pos; - - return read; + if (total_read) { + iocb->ki_pos = pos + total_read; + return total_read; + } + return rc; } ssize_t From ec69557982563c97b3a7d68dd271be5105b83869 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 4 Feb 2014 19:08:21 -0500 Subject: [PATCH 34/63] read_code(): go through vfs_read() instead of calling the method directly ... and don't skip on sanity checks. It's *not* a hot path, TYVM (a couple of calls per a.out execve(), for pity sake) and headers of random a.out binary are not to be trusted. Signed-off-by: Al Viro --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 3d78fccdd723..4cc94534ed5b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -810,7 +810,7 @@ EXPORT_SYMBOL(kernel_read); ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { - ssize_t res = file->f_op->read(file, (void __user *)addr, len, &pos); + ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_range(addr, addr + len); return res; From 480402e18def5514c9dc8cb04e3c0e7482ff2b86 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Feb 2014 10:14:01 -0500 Subject: [PATCH 35/63] untangling process_vm_..., part 1 we want to massage it to use of iov_iter. This one is an equivalent transformation - just introduce a local variable mirroring lvec + *lvec_current. 
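In other words, the change only caches the cursor in a local; the sketch below condenses it from the diff: every lvec[*lvec_current] access becomes a dereference of iov, and the two are advanced in lockstep, so behaviour is unchanged.

        const struct iovec *iov = lvec + *lvec_current;

        /* was: while (*lvec_current < lvec_cnt && lvec[*lvec_current].iov_len == 0) */
        while (*lvec_current < lvec_cnt && iov->iov_len == 0) {
                iov++;                  /* keep the mirror in step with the index */
                (*lvec_current)++;
        }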
Signed-off-by: Al Viro --- mm/process_vm_access.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index fd26d0433509..7b8d63e6c30b 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -60,6 +60,7 @@ static int process_vm_rw_pages(struct task_struct *task, int ret; ssize_t bytes_to_copy; ssize_t rc = 0; + const struct iovec *iov = lvec + *lvec_current; *bytes_copied = 0; @@ -81,8 +82,10 @@ static int process_vm_rw_pages(struct task_struct *task, pgs_copied++) { /* Make sure we have a non zero length iovec */ while (*lvec_current < lvec_cnt - && lvec[*lvec_current].iov_len == 0) + && iov->iov_len == 0) { + iov++; (*lvec_current)++; + } if (*lvec_current == lvec_cnt) break; @@ -94,18 +97,18 @@ static int process_vm_rw_pages(struct task_struct *task, bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, len - *bytes_copied); bytes_to_copy = min_t(ssize_t, bytes_to_copy, - lvec[*lvec_current].iov_len + iov->iov_len - *lvec_offset); target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; if (vm_write) ret = copy_from_user(target_kaddr, - lvec[*lvec_current].iov_base + iov->iov_base + *lvec_offset, bytes_to_copy); else - ret = copy_to_user(lvec[*lvec_current].iov_base + ret = copy_to_user(iov->iov_base + *lvec_offset, target_kaddr, bytes_to_copy); kunmap(process_pages[pgs_copied]); @@ -117,12 +120,13 @@ static int process_vm_rw_pages(struct task_struct *task, } *bytes_copied += bytes_to_copy; *lvec_offset += bytes_to_copy; - if (*lvec_offset == lvec[*lvec_current].iov_len) { + if (*lvec_offset == iov->iov_len) { /* * Need to copy remaining part of page into the * next iovec if there are any bytes left in page */ (*lvec_current)++; + iov++; *lvec_offset = 0; start_offset = (start_offset + bytes_to_copy) % PAGE_SIZE; From c61c70384fad407bfcb066c3fb9271164d630212 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Feb 2014 10:22:50 -0500 Subject: [PATCH 36/63] untangling process_vm_..., part 2 move iov to caller's stack frame; the value we assign to it on the next call of process_vm_rw_pages() is equal to the value it had when the last time we were leaving process_vm_rw_pages(). drop lvec argument of process_vm_rw_pages() - it's not used anymore. 
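The resulting calling convention, condensed from the diff below: the iovec cursor now lives in the caller's frame and the callee advances it through a pointer to it, so the position survives from one call to the next.

        const struct iovec *iov = *iovp;        /* resume where the previous call left off */

        /* ... the copy loop advances iov as segments are consumed ... */

        *iovp = iov;                            /* caller sees the advanced cursor */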
Signed-off-by: Al Viro --- mm/process_vm_access.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 7b8d63e6c30b..186ec5db6090 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -45,7 +45,7 @@ static int process_vm_rw_pages(struct task_struct *task, unsigned long pa, unsigned long start_offset, unsigned long len, - const struct iovec *lvec, + const struct iovec **iovp, unsigned long lvec_cnt, unsigned long *lvec_current, size_t *lvec_offset, @@ -60,7 +60,7 @@ static int process_vm_rw_pages(struct task_struct *task, int ret; ssize_t bytes_to_copy; ssize_t rc = 0; - const struct iovec *iov = lvec + *lvec_current; + const struct iovec *iov = *iovp; *bytes_copied = 0; @@ -149,6 +149,7 @@ end: put_page(process_pages[j]); } + *iovp = iov; return rc; } @@ -192,6 +193,7 @@ static int process_vm_rw_single_vec(unsigned long addr, unsigned long nr_pages_to_copy; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); + const struct iovec *iov = lvec + *lvec_current; *bytes_copied = 0; @@ -206,7 +208,7 @@ static int process_vm_rw_single_vec(unsigned long addr, rc = process_vm_rw_pages(task, mm, process_pages, pa, start_offset, len, - lvec, lvec_cnt, + &iov, lvec_cnt, lvec_current, lvec_offset, vm_write, nr_pages_to_copy, &bytes_copied_loop); From 12e3004e464f554e0d57ddd0b4185060275e1f12 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Feb 2014 10:28:58 -0500 Subject: [PATCH 37/63] untangling process_vm_..., part 3 lift iov one more level out - from process_vm_rw_single_vec to process_vm_rw_core(). Same story as with the previous commit. Signed-off-by: Al Viro --- mm/process_vm_access.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 186ec5db6090..40dfb396e8ab 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -174,7 +174,7 @@ end: */ static int process_vm_rw_single_vec(unsigned long addr, unsigned long len, - const struct iovec *lvec, + const struct iovec **iovp, unsigned long lvec_cnt, unsigned long *lvec_current, size_t *lvec_offset, @@ -193,7 +193,6 @@ static int process_vm_rw_single_vec(unsigned long addr, unsigned long nr_pages_to_copy; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - const struct iovec *iov = lvec + *lvec_current; *bytes_copied = 0; @@ -208,7 +207,7 @@ static int process_vm_rw_single_vec(unsigned long addr, rc = process_vm_rw_pages(task, mm, process_pages, pa, start_offset, len, - &iov, lvec_cnt, + iovp, lvec_cnt, lvec_current, lvec_offset, vm_write, nr_pages_to_copy, &bytes_copied_loop); @@ -319,7 +318,7 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, - lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, + &lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, process_pages, mm, task, vm_write, &bytes_copied_loop); bytes_copied += bytes_copied_loop; if (rc != 0) { From 1291afc18115885cf3ff265a5dc836393060e820 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Feb 2014 10:40:15 -0500 Subject: [PATCH 38/63] untangling process_vm_..., part 4 instead of passing vector size (by value) and index (by reference), pass the number of elements remaining. That's all we care about in these functions by that point. 
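The before/after of the bookkeeping, condensed from parts 1 and 4 of this series:

        /* before: position described by an index plus the total count */
        while (*lvec_current < lvec_cnt && iov->iov_len == 0) {
                iov++;
                (*lvec_current)++;
        }

        /* after: a single "segments remaining" counter */
        while (*left && iov->iov_len == 0) {
                iov++;
                (*left)--;
        }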
Signed-off-by: Al Viro --- mm/process_vm_access.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 40dfb396e8ab..4bbd495fcedd 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -46,8 +46,7 @@ static int process_vm_rw_pages(struct task_struct *task, unsigned long start_offset, unsigned long len, const struct iovec **iovp, - unsigned long lvec_cnt, - unsigned long *lvec_current, + unsigned long *left, size_t *lvec_offset, int vm_write, unsigned int nr_pages_to_copy, @@ -78,15 +77,14 @@ static int process_vm_rw_pages(struct task_struct *task, /* Do the copy for each page */ for (pgs_copied = 0; - (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); + (pgs_copied < nr_pages_to_copy) && *left; pgs_copied++) { /* Make sure we have a non zero length iovec */ - while (*lvec_current < lvec_cnt - && iov->iov_len == 0) { + while (*left && iov->iov_len == 0) { iov++; - (*lvec_current)++; + (*left)--; } - if (*lvec_current == lvec_cnt) + if (!*left) break; /* @@ -125,7 +123,7 @@ static int process_vm_rw_pages(struct task_struct *task, * Need to copy remaining part of page into the * next iovec if there are any bytes left in page */ - (*lvec_current)++; + (*left)--; iov++; *lvec_offset = 0; start_offset = (start_offset + bytes_to_copy) @@ -175,8 +173,7 @@ end: static int process_vm_rw_single_vec(unsigned long addr, unsigned long len, const struct iovec **iovp, - unsigned long lvec_cnt, - unsigned long *lvec_current, + unsigned long *left, size_t *lvec_offset, struct page **process_pages, struct mm_struct *mm, @@ -201,14 +198,14 @@ static int process_vm_rw_single_vec(unsigned long addr, return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; - while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { + while ((nr_pages_copied < nr_pages) && *left) { nr_pages_to_copy = min(nr_pages - nr_pages_copied, max_pages_per_loop); rc = process_vm_rw_pages(task, mm, process_pages, pa, start_offset, len, - iovp, lvec_cnt, - lvec_current, lvec_offset, + iovp, left, + lvec_offset, vm_write, nr_pages_to_copy, &bytes_copied_loop); start_offset = 0; @@ -259,7 +256,7 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, ssize_t bytes_copied = 0; unsigned long nr_pages = 0; unsigned long nr_pages_iov; - unsigned long iov_l_curr_idx = 0; + unsigned long left = liovcnt; size_t iov_l_curr_offset = 0; ssize_t iov_len; @@ -315,10 +312,10 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto put_task_struct; } - for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { + for (i = 0; i < riovcnt && left; i++) { rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, - &lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, + &lvec, &left, &iov_l_curr_offset, process_pages, mm, task, vm_write, &bytes_copied_loop); bytes_copied += bytes_copied_loop; if (rc != 0) { From 9f78bdfabf0de0bc8d3cc97d06908651b577a567 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Feb 2014 11:51:53 -0500 Subject: [PATCH 39/63] process_vm_access: switch to iov_iter instead of keeping its pieces in separate variables and passing pointers to all of them... 
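The correspondence is mechanical; roughly (a sketch of the mapping onto the struct iov_iter fields of that era, with total_len standing for the byte count returned by rw_copy_check_uvector()):

        struct iov_iter iter;

        iov_iter_init(&iter, iov_l, liovcnt, total_len, 0);

        /*  old pieces                  new home                          */
        /*  lvec + *lvec_current   ->   iter.iov                          */
        /*  segments remaining     ->   iter.nr_segs                      */
        /*  *lvec_offset           ->   iter.iov_offset                   */
        /*  bytes still to copy    ->   iter.count / iov_iter_count()     */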
Signed-off-by: Al Viro --- mm/process_vm_access.c | 62 +++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 4bbd495fcedd..1e3c81103b2c 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -45,9 +45,7 @@ static int process_vm_rw_pages(struct task_struct *task, unsigned long pa, unsigned long start_offset, unsigned long len, - const struct iovec **iovp, - unsigned long *left, - size_t *lvec_offset, + struct iov_iter *iter, int vm_write, unsigned int nr_pages_to_copy, ssize_t *bytes_copied) @@ -59,7 +57,7 @@ static int process_vm_rw_pages(struct task_struct *task, int ret; ssize_t bytes_to_copy; ssize_t rc = 0; - const struct iovec *iov = *iovp; + const struct iovec *iov = iter->iov; *bytes_copied = 0; @@ -77,14 +75,14 @@ static int process_vm_rw_pages(struct task_struct *task, /* Do the copy for each page */ for (pgs_copied = 0; - (pgs_copied < nr_pages_to_copy) && *left; + (pgs_copied < nr_pages_to_copy) && iter->nr_segs; pgs_copied++) { /* Make sure we have a non zero length iovec */ - while (*left && iov->iov_len == 0) { + while (iter->nr_segs && iov->iov_len == 0) { iov++; - (*left)--; + iter->nr_segs--; } - if (!*left) + if (!iter->nr_segs) break; /* @@ -96,18 +94,18 @@ static int process_vm_rw_pages(struct task_struct *task, len - *bytes_copied); bytes_to_copy = min_t(ssize_t, bytes_to_copy, iov->iov_len - - *lvec_offset); + - iter->iov_offset); target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; if (vm_write) ret = copy_from_user(target_kaddr, iov->iov_base - + *lvec_offset, + + iter->iov_offset, bytes_to_copy); else ret = copy_to_user(iov->iov_base - + *lvec_offset, + + iter->iov_offset, target_kaddr, bytes_to_copy); kunmap(process_pages[pgs_copied]); if (ret) { @@ -117,15 +115,15 @@ static int process_vm_rw_pages(struct task_struct *task, goto end; } *bytes_copied += bytes_to_copy; - *lvec_offset += bytes_to_copy; - if (*lvec_offset == iov->iov_len) { + iter->iov_offset += bytes_to_copy; + if (iter->iov_offset == iov->iov_len) { /* * Need to copy remaining part of page into the * next iovec if there are any bytes left in page */ - (*left)--; + iter->nr_segs--; iov++; - *lvec_offset = 0; + iter->iov_offset = 0; start_offset = (start_offset + bytes_to_copy) % PAGE_SIZE; if (start_offset) @@ -147,7 +145,7 @@ end: put_page(process_pages[j]); } - *iovp = iov; + iter->iov = iov; return rc; } @@ -172,9 +170,7 @@ end: */ static int process_vm_rw_single_vec(unsigned long addr, unsigned long len, - const struct iovec **iovp, - unsigned long *left, - size_t *lvec_offset, + struct iov_iter *iter, struct page **process_pages, struct mm_struct *mm, struct task_struct *task, @@ -198,14 +194,12 @@ static int process_vm_rw_single_vec(unsigned long addr, return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; - while ((nr_pages_copied < nr_pages) && *left) { + while ((nr_pages_copied < nr_pages) && iter->nr_segs) { nr_pages_to_copy = min(nr_pages - nr_pages_copied, max_pages_per_loop); rc = process_vm_rw_pages(task, mm, process_pages, pa, - start_offset, len, - iovp, left, - lvec_offset, + start_offset, len, iter, vm_write, nr_pages_to_copy, &bytes_copied_loop); start_offset = 0; @@ -240,8 +234,7 @@ static int process_vm_rw_single_vec(unsigned long addr, * return less bytes than expected if an error occurs during the copying * process. 
*/ -static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, - unsigned long liovcnt, +static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, const struct iovec *rvec, unsigned long riovcnt, unsigned long flags, int vm_write) @@ -256,8 +249,6 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, ssize_t bytes_copied = 0; unsigned long nr_pages = 0; unsigned long nr_pages_iov; - unsigned long left = liovcnt; - size_t iov_l_curr_offset = 0; ssize_t iov_len; /* @@ -312,11 +303,11 @@ static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, goto put_task_struct; } - for (i = 0; i < riovcnt && left; i++) { + for (i = 0; i < riovcnt && iter->nr_segs; i++) { rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, - &lvec, &left, &iov_l_curr_offset, - process_pages, mm, task, vm_write, &bytes_copied_loop); + iter, process_pages, mm, task, vm_write, + &bytes_copied_loop); bytes_copied += bytes_copied_loop; if (rc != 0) { /* If we have managed to copy any data at all then @@ -365,6 +356,7 @@ static ssize_t process_vm_rw(pid_t pid, struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r = iovstack_r; + struct iov_iter iter; ssize_t rc; if (flags != 0) @@ -380,13 +372,14 @@ static ssize_t process_vm_rw(pid_t pid, if (rc <= 0) goto free_iovecs; + iov_iter_init(&iter, iov_l, liovcnt, rc, 0); + rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); if (rc <= 0) goto free_iovecs; - rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, - vm_write); + rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); free_iovecs: if (iov_r != iovstack_r) @@ -426,6 +419,7 @@ compat_process_vm_rw(compat_pid_t pid, struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; struct iovec *iov_r = iovstack_r; + struct iov_iter iter; ssize_t rc = -EFAULT; if (flags != 0) @@ -441,14 +435,14 @@ compat_process_vm_rw(compat_pid_t pid, &iov_l); if (rc <= 0) goto free_iovecs; + iov_iter_init(&iter, iov_l, liovcnt, rc, 0); rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, iovstack_r, &iov_r); if (rc <= 0) goto free_iovecs; - rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, - vm_write); + rc = process_vm_rw_core(pid, &iter, iov_r, riovcnt, flags, vm_write); free_iovecs: if (iov_r != iovstack_r) From 240f3905f513cd5f27cc3f79bf03bf3f88323f41 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Feb 2014 12:14:11 -0500 Subject: [PATCH 40/63] process_vm_access: switch to copy_page_to_iter/iov_iter_copy_from_user ... rather than open-coding those. As a side benefit, we get much saner loop calling those; we can just feed entire pages, instead of the "copy would span the iovec boundary, let's do it in two loop iterations" mess. 
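With that, the per-page copy that this patch and the next few cleanups converge on looks roughly like the sketch below (simplified; the from-user helper is not bounded by the iterator's byte count, hence the explicit clamp on the write side):

        while (len && iov_iter_count(iter)) {
                struct page *page = *pages++;
                size_t copy = PAGE_SIZE - offset;
                size_t copied;

                if (copy > len)
                        copy = len;
                if (vm_write) {
                        if (copy > iov_iter_count(iter))
                                copy = iov_iter_count(iter);
                        copied = iov_iter_copy_from_user(page, iter, offset, copy);
                        iov_iter_advance(iter, copied);
                        set_page_dirty_lock(page);
                } else {
                        copied = copy_page_to_iter(page, offset, copy, iter);
                }
                len -= copied;
                /* a short copy with iovec space still left means a real fault */
                if (copied < copy && iov_iter_count(iter))
                        return -EFAULT;
                offset = 0;
        }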
Signed-off-by: Al Viro --- mm/process_vm_access.c | 91 +++++++++++------------------------------- 1 file changed, 23 insertions(+), 68 deletions(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 1e3c81103b2c..70ec3156f45a 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -51,13 +51,11 @@ static int process_vm_rw_pages(struct task_struct *task, ssize_t *bytes_copied) { int pages_pinned; - void *target_kaddr; int pgs_copied = 0; int j; int ret; ssize_t bytes_to_copy; ssize_t rc = 0; - const struct iovec *iov = iter->iov; *bytes_copied = 0; @@ -75,77 +73,34 @@ static int process_vm_rw_pages(struct task_struct *task, /* Do the copy for each page */ for (pgs_copied = 0; - (pgs_copied < nr_pages_to_copy) && iter->nr_segs; + (pgs_copied < nr_pages_to_copy) && iov_iter_count(iter); pgs_copied++) { - /* Make sure we have a non zero length iovec */ - while (iter->nr_segs && iov->iov_len == 0) { - iov++; - iter->nr_segs--; - } - if (!iter->nr_segs) - break; + struct page *page = process_pages[pgs_copied]; + bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, len); - /* - * Will copy smallest of: - * - bytes remaining in page - * - bytes remaining in destination iovec - */ - bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, - len - *bytes_copied); - bytes_to_copy = min_t(ssize_t, bytes_to_copy, - iov->iov_len - - iter->iov_offset); - - target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; - - if (vm_write) - ret = copy_from_user(target_kaddr, - iov->iov_base - + iter->iov_offset, - bytes_to_copy); - else - ret = copy_to_user(iov->iov_base - + iter->iov_offset, - target_kaddr, bytes_to_copy); - kunmap(process_pages[pgs_copied]); - if (ret) { - *bytes_copied += bytes_to_copy - ret; - pgs_copied++; - rc = -EFAULT; - goto end; - } - *bytes_copied += bytes_to_copy; - iter->iov_offset += bytes_to_copy; - if (iter->iov_offset == iov->iov_len) { - /* - * Need to copy remaining part of page into the - * next iovec if there are any bytes left in page - */ - iter->nr_segs--; - iov++; - iter->iov_offset = 0; - start_offset = (start_offset + bytes_to_copy) - % PAGE_SIZE; - if (start_offset) - pgs_copied--; + if (vm_write) { + if (bytes_to_copy > iov_iter_count(iter)) + bytes_to_copy = iov_iter_count(iter); + ret = iov_iter_copy_from_user(page, + iter, start_offset, bytes_to_copy); + iov_iter_advance(iter, ret); + set_page_dirty_lock(page); } else { - start_offset = 0; + ret = copy_page_to_iter(page, start_offset, + bytes_to_copy, iter); } + *bytes_copied += ret; + len -= ret; + if (ret < bytes_to_copy && iov_iter_count(iter)) { + rc = -EFAULT; + break; + } + start_offset = 0; } end: - if (vm_write) { - for (j = 0; j < pages_pinned; j++) { - if (j < pgs_copied) - set_page_dirty_lock(process_pages[j]); - put_page(process_pages[j]); - } - } else { - for (j = 0; j < pages_pinned; j++) - put_page(process_pages[j]); - } - - iter->iov = iov; + for (j = 0; j < pages_pinned; j++) + put_page(process_pages[j]); return rc; } @@ -194,7 +149,7 @@ static int process_vm_rw_single_vec(unsigned long addr, return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; - while ((nr_pages_copied < nr_pages) && iter->nr_segs) { + while ((nr_pages_copied < nr_pages) && iov_iter_count(iter)) { nr_pages_to_copy = min(nr_pages - nr_pages_copied, max_pages_per_loop); @@ -303,7 +258,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto put_task_struct; } - for (i = 0; i < riovcnt && iter->nr_segs; i++) { + for (i = 0; i < riovcnt && 
iov_iter_count(iter); i++) { rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, iter, process_pages, mm, task, vm_write, From 70eca12d80a8b51800b4ebfbc629e0ee34166779 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Feb 2014 12:44:24 -0500 Subject: [PATCH 41/63] process_vm_access: take get_user_pages/put_pages one level up ... and trim the fuck out of process_vm_rw_pages() argument list. Signed-off-by: Al Viro --- mm/process_vm_access.c | 97 +++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 58 deletions(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 70ec3156f45a..c2a1916bacbf 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -39,69 +39,39 @@ * @bytes_copied: returns number of bytes successfully copied * Returns 0 on success, error code otherwise */ -static int process_vm_rw_pages(struct task_struct *task, - struct mm_struct *mm, - struct page **process_pages, - unsigned long pa, - unsigned long start_offset, +static int process_vm_rw_pages(struct page **pages, + unsigned offset, unsigned long len, struct iov_iter *iter, int vm_write, unsigned int nr_pages_to_copy, ssize_t *bytes_copied) { - int pages_pinned; - int pgs_copied = 0; - int j; - int ret; - ssize_t bytes_to_copy; - ssize_t rc = 0; - *bytes_copied = 0; - /* Get the pages we're interested in */ - down_read(&mm->mmap_sem); - pages_pinned = get_user_pages(task, mm, pa, - nr_pages_to_copy, - vm_write, 0, process_pages, NULL); - up_read(&mm->mmap_sem); - - if (pages_pinned != nr_pages_to_copy) { - rc = -EFAULT; - goto end; - } - /* Do the copy for each page */ - for (pgs_copied = 0; - (pgs_copied < nr_pages_to_copy) && iov_iter_count(iter); - pgs_copied++) { - struct page *page = process_pages[pgs_copied]; - bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, len); + while (iov_iter_count(iter) && nr_pages_to_copy--) { + struct page *page = *pages++; + size_t copy = min_t(ssize_t, PAGE_SIZE - offset, len); + size_t copied; if (vm_write) { - if (bytes_to_copy > iov_iter_count(iter)) - bytes_to_copy = iov_iter_count(iter); - ret = iov_iter_copy_from_user(page, - iter, start_offset, bytes_to_copy); - iov_iter_advance(iter, ret); + if (copy > iov_iter_count(iter)) + copy = iov_iter_count(iter); + copied = iov_iter_copy_from_user(page, iter, + offset, copy); + iov_iter_advance(iter, copied); set_page_dirty_lock(page); } else { - ret = copy_page_to_iter(page, start_offset, - bytes_to_copy, iter); + copied = copy_page_to_iter(page, offset, copy, iter); } - *bytes_copied += ret; - len -= ret; - if (ret < bytes_to_copy && iov_iter_count(iter)) { - rc = -EFAULT; - break; - } - start_offset = 0; + *bytes_copied += copied; + len -= copied; + if (copied < copy && iov_iter_count(iter)) + return -EFAULT; + offset = 0; } - -end: - for (j = 0; j < pages_pinned; j++) - put_page(process_pages[j]); - return rc; + return 0; } /* Maximum number of pages kmalloc'd to hold struct page's during copy */ @@ -138,7 +108,6 @@ static int process_vm_rw_single_vec(unsigned long addr, ssize_t bytes_copied_loop; ssize_t rc = 0; unsigned long nr_pages_copied = 0; - unsigned long nr_pages_to_copy; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); @@ -150,23 +119,35 @@ static int process_vm_rw_single_vec(unsigned long addr, nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; while ((nr_pages_copied < nr_pages) && iov_iter_count(iter)) { + int nr_pages_to_copy; + int pages_pinned; nr_pages_to_copy = min(nr_pages - 
nr_pages_copied, max_pages_per_loop); - rc = process_vm_rw_pages(task, mm, process_pages, pa, + /* Get the pages we're interested in */ + down_read(&mm->mmap_sem); + pages_pinned = get_user_pages(task, mm, pa, + nr_pages_to_copy, + vm_write, 0, process_pages, NULL); + up_read(&mm->mmap_sem); + + if (pages_pinned <= 0) + return -EFAULT; + + rc = process_vm_rw_pages(process_pages, start_offset, len, iter, - vm_write, nr_pages_to_copy, + vm_write, pages_pinned, &bytes_copied_loop); start_offset = 0; *bytes_copied += bytes_copied_loop; + len -= bytes_copied_loop; + nr_pages_copied += pages_pinned; + pa += pages_pinned * PAGE_SIZE; + while (pages_pinned) + put_page(process_pages[--pages_pinned]); - if (rc < 0) { - return rc; - } else { - len -= bytes_copied_loop; - nr_pages_copied += nr_pages_to_copy; - pa += nr_pages_to_copy * PAGE_SIZE; - } + if (rc < 0) + break; } return rc; From e21345f9c3f8a806604910cae886e471a860ab86 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Feb 2014 12:55:11 -0500 Subject: [PATCH 42/63] process_vm_rw_pages(): pass accurate amount of bytes ... makes passing the amount of pages unnecessary Signed-off-by: Al Viro --- mm/process_vm_access.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index c2a1916bacbf..83252d783482 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -41,20 +41,22 @@ */ static int process_vm_rw_pages(struct page **pages, unsigned offset, - unsigned long len, + size_t len, struct iov_iter *iter, int vm_write, - unsigned int nr_pages_to_copy, ssize_t *bytes_copied) { *bytes_copied = 0; /* Do the copy for each page */ - while (iov_iter_count(iter) && nr_pages_to_copy--) { + while (iov_iter_count(iter) && len) { struct page *page = *pages++; - size_t copy = min_t(ssize_t, PAGE_SIZE - offset, len); + size_t copy = PAGE_SIZE - offset; size_t copied; + if (copy > len) + copy = len; + if (vm_write) { if (copy > iov_iter_count(iter)) copy = iov_iter_count(iter); @@ -121,6 +123,7 @@ static int process_vm_rw_single_vec(unsigned long addr, while ((nr_pages_copied < nr_pages) && iov_iter_count(iter)) { int nr_pages_to_copy; int pages_pinned; + size_t n; nr_pages_to_copy = min(nr_pages - nr_pages_copied, max_pages_per_loop); @@ -134,18 +137,21 @@ static int process_vm_rw_single_vec(unsigned long addr, if (pages_pinned <= 0) return -EFAULT; + n = pages_pinned * PAGE_SIZE - start_offset; + if (n > len) + n = len; + rc = process_vm_rw_pages(process_pages, - start_offset, len, iter, - vm_write, pages_pinned, + start_offset, n, iter, + vm_write, &bytes_copied_loop); + len -= n; start_offset = 0; *bytes_copied += bytes_copied_loop; - len -= bytes_copied_loop; nr_pages_copied += pages_pinned; pa += pages_pinned * PAGE_SIZE; while (pages_pinned) put_page(process_pages[--pages_pinned]); - if (rc < 0) break; } From 9acc1a0f9af244826de0eea2c38b78410eb0df6d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Feb 2014 13:15:28 -0500 Subject: [PATCH 43/63] process_vm_access: don't bother with returning the amounts of bytes copied we can calculate that in the caller just fine, TYVM Signed-off-by: Al Viro --- mm/process_vm_access.c | 47 ++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 31 deletions(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 83252d783482..15e4c2f312e8 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -43,13 +43,10 @@ static int process_vm_rw_pages(struct page **pages, unsigned 
offset, size_t len, struct iov_iter *iter, - int vm_write, - ssize_t *bytes_copied) + int vm_write) { - *bytes_copied = 0; - /* Do the copy for each page */ - while (iov_iter_count(iter) && len) { + while (len && iov_iter_count(iter)) { struct page *page = *pages++; size_t copy = PAGE_SIZE - offset; size_t copied; @@ -67,7 +64,6 @@ static int process_vm_rw_pages(struct page **pages, } else { copied = copy_page_to_iter(page, offset, copy, iter); } - *bytes_copied += copied; len -= copied; if (copied < copy && iov_iter_count(iter)) return -EFAULT; @@ -101,20 +97,16 @@ static int process_vm_rw_single_vec(unsigned long addr, struct page **process_pages, struct mm_struct *mm, struct task_struct *task, - int vm_write, - ssize_t *bytes_copied) + int vm_write) { unsigned long pa = addr & PAGE_MASK; unsigned long start_offset = addr - pa; unsigned long nr_pages; - ssize_t bytes_copied_loop; ssize_t rc = 0; unsigned long nr_pages_copied = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - *bytes_copied = 0; - /* Work out address and page range required */ if (len == 0) return 0; @@ -143,11 +135,9 @@ static int process_vm_rw_single_vec(unsigned long addr, rc = process_vm_rw_pages(process_pages, start_offset, n, iter, - vm_write, - &bytes_copied_loop); + vm_write); len -= n; start_offset = 0; - *bytes_copied += bytes_copied_loop; nr_pages_copied += pages_pinned; pa += pages_pinned * PAGE_SIZE; while (pages_pinned) @@ -187,11 +177,10 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, struct mm_struct *mm; unsigned long i; ssize_t rc = 0; - ssize_t bytes_copied_loop; - ssize_t bytes_copied = 0; unsigned long nr_pages = 0; unsigned long nr_pages_iov; ssize_t iov_len; + size_t total_len = iov_iter_count(iter); /* * Work out how many pages of struct pages we're going to need @@ -245,24 +234,20 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, goto put_task_struct; } - for (i = 0; i < riovcnt && iov_iter_count(iter); i++) { + for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++) rc = process_vm_rw_single_vec( (unsigned long)rvec[i].iov_base, rvec[i].iov_len, - iter, process_pages, mm, task, vm_write, - &bytes_copied_loop); - bytes_copied += bytes_copied_loop; - if (rc != 0) { - /* If we have managed to copy any data at all then - we return the number of bytes copied. Otherwise - we return the error code */ - if (bytes_copied) - rc = bytes_copied; - goto put_mm; - } - } + iter, process_pages, mm, task, vm_write); + + /* copied = space before - space after */ + total_len -= iov_iter_count(iter); + + /* If we have managed to copy any data at all then + we return the number of bytes copied. Otherwise + we return the error code */ + if (total_len) + rc = total_len; - rc = bytes_copied; -put_mm: mmput(mm); put_task_struct: From 4bafbec7bf60ed56ccbb36a96091bdbd162f075d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Feb 2014 13:25:32 -0500 Subject: [PATCH 44/63] process_vm_access: tidy up a bit saner variable names, update linuxdoc comments, etc. 
Signed-off-by: Al Viro --- mm/process_vm_access.c | 59 ++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 40 deletions(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 15e4c2f312e8..d6bd3fdd6925 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -23,20 +23,11 @@ /** * process_vm_rw_pages - read/write pages from task specified - * @task: task to read/write from - * @mm: mm for task - * @process_pages: struct pages area that can store at least - * nr_pages_to_copy struct page pointers - * @pa: address of page in task to start copying from/to + * @pages: array of pointers to pages we want to copy * @start_offset: offset in page to start copying from/to * @len: number of bytes to copy - * @lvec: iovec array specifying where to copy to/from - * @lvec_cnt: number of elements in iovec array - * @lvec_current: index in iovec array we are up to - * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @iter: where to copy to/from locally * @vm_write: 0 means copy from, 1 means copy to - * @nr_pages_to_copy: number of pages to copy - * @bytes_copied: returns number of bytes successfully copied * Returns 0 on success, error code otherwise */ static int process_vm_rw_pages(struct page **pages, @@ -79,16 +70,12 @@ static int process_vm_rw_pages(struct page **pages, * process_vm_rw_single_vec - read/write pages from task specified * @addr: start memory address of target process * @len: size of area to copy to/from - * @lvec: iovec array specifying where to copy to/from locally - * @lvec_cnt: number of elements in iovec array - * @lvec_current: index in iovec array we are up to - * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @iter: where to copy to/from locally * @process_pages: struct pages area that can store at least * nr_pages_to_copy struct page pointers * @mm: mm for task * @task: task to read/write from * @vm_write: 0 means copy from, 1 means copy to - * @bytes_copied: returns number of bytes successfully copied * Returns 0 on success or on failure error code */ static int process_vm_rw_single_vec(unsigned long addr, @@ -103,7 +90,6 @@ static int process_vm_rw_single_vec(unsigned long addr, unsigned long start_offset = addr - pa; unsigned long nr_pages; ssize_t rc = 0; - unsigned long nr_pages_copied = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); @@ -112,38 +98,32 @@ static int process_vm_rw_single_vec(unsigned long addr, return 0; nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; - while ((nr_pages_copied < nr_pages) && iov_iter_count(iter)) { - int nr_pages_to_copy; - int pages_pinned; - size_t n; - nr_pages_to_copy = min(nr_pages - nr_pages_copied, - max_pages_per_loop); + while (!rc && nr_pages && iov_iter_count(iter)) { + int pages = min(nr_pages, max_pages_per_loop); + size_t bytes; /* Get the pages we're interested in */ down_read(&mm->mmap_sem); - pages_pinned = get_user_pages(task, mm, pa, - nr_pages_to_copy, - vm_write, 0, process_pages, NULL); + pages = get_user_pages(task, mm, pa, pages, + vm_write, 0, process_pages, NULL); up_read(&mm->mmap_sem); - if (pages_pinned <= 0) + if (pages <= 0) return -EFAULT; - n = pages_pinned * PAGE_SIZE - start_offset; - if (n > len) - n = len; + bytes = pages * PAGE_SIZE - start_offset; + if (bytes > len) + bytes = len; rc = process_vm_rw_pages(process_pages, - start_offset, n, iter, + start_offset, bytes, iter, vm_write); - len -= n; + len -= bytes; start_offset = 0; - 
nr_pages_copied += pages_pinned; - pa += pages_pinned * PAGE_SIZE; - while (pages_pinned) - put_page(process_pages[--pages_pinned]); - if (rc < 0) - break; + nr_pages -= pages; + pa += pages * PAGE_SIZE; + while (pages) + put_page(process_pages[--pages]); } return rc; @@ -156,8 +136,7 @@ static int process_vm_rw_single_vec(unsigned long addr, /** * process_vm_rw_core - core of reading/writing pages from task specified * @pid: PID of process to read/write from/to - * @lvec: iovec array specifying where to copy to/from locally - * @liovcnt: size of lvec array + * @iter: where to copy to/from locally * @rvec: iovec array specifying where to copy to/from in the other process * @riovcnt: size of rvec array * @flags: currently unused From 4f18cd317a118c28482f97303600a2fe2ada6c79 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 5 Feb 2014 19:11:33 -0500 Subject: [PATCH 45/63] take iov_iter stuff to mm/iov_iter.c Signed-off-by: Al Viro --- mm/Makefile | 2 +- mm/filemap.c | 221 ------------------------------------------------- mm/iov_iter.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 225 insertions(+), 222 deletions(-) create mode 100644 mm/iov_iter.c diff --git a/mm/Makefile b/mm/Makefile index 310c90a09264..178a43406b0c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -17,7 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o balloon_compaction.o \ - interval_tree.o list_lru.o $(mmu-y) + interval_tree.o list_lru.o iov_iter.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/filemap.c b/mm/filemap.c index a16eb2c4f316..c4730efa5d9e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1085,84 +1085,6 @@ static void shrink_readahead_size_eio(struct file *filp, ra->ra_pages /= 4; } -size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - void *kaddr, *from; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - if (!fault_in_pages_writeable(buf, copy)) { - kaddr = kmap_atomic(page); - from = kaddr + offset; - - /* first chunk, usually the only one */ - left = __copy_to_user_inatomic(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user_inatomic(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - if (likely(!bytes)) { - kunmap_atomic(kaddr); - goto done; - } - offset = from - kaddr; - buf += copy; - kunmap_atomic(kaddr); - copy = min(bytes, iov->iov_len - skip); - } - /* Too bad - revert to non-atomic kmap */ - kaddr = kmap(page); - from = kaddr + offset; - left = __copy_to_user(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - kunmap(page); -done: - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} -EXPORT_SYMBOL(copy_page_to_iter); - /** * do_generic_file_read - 
generic file read routine * @filp: the file to read @@ -1957,149 +1879,6 @@ struct page *read_cache_page(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page); -static size_t __iovec_copy_from_user_inatomic(char *vaddr, - const struct iovec *iov, size_t base, size_t bytes) -{ - size_t copied = 0, left = 0; - - while (bytes) { - char __user *buf = iov->iov_base + base; - int copy = min(bytes, iov->iov_len - base); - - base = 0; - left = __copy_from_user_inatomic(vaddr, buf, copy); - copied += copy; - bytes -= copy; - vaddr += copy; - iov++; - - if (unlikely(left)) - break; - } - return copied - left; -} - -/* - * Copy as much as we can into the page and return the number of bytes which - * were successfully copied. If a fault is encountered then return the number of - * bytes which were copied. - */ -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap_atomic(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap_atomic(kaddr); - - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); - -/* - * This has the same sideeffects and return value as - * iov_iter_copy_from_user_atomic(). - * The difference is that it attempts to resolve faults. - * Page must not be locked. - */ -size_t iov_iter_copy_from_user(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap(page); - if (likely(i->nr_segs == 1)) { - int left; - char __user *buf = i->iov->iov_base + i->iov_offset; - left = __copy_from_user(kaddr + offset, buf, bytes); - copied = bytes - left; - } else { - copied = __iovec_copy_from_user_inatomic(kaddr + offset, - i->iov, i->iov_offset, bytes); - } - kunmap(page); - return copied; -} -EXPORT_SYMBOL(iov_iter_copy_from_user); - -void iov_iter_advance(struct iov_iter *i, size_t bytes) -{ - BUG_ON(i->count < bytes); - - if (likely(i->nr_segs == 1)) { - i->iov_offset += bytes; - i->count -= bytes; - } else { - const struct iovec *iov = i->iov; - size_t base = i->iov_offset; - unsigned long nr_segs = i->nr_segs; - - /* - * The !iov->iov_len check ensures we skip over unlikely - * zero-length segments (without overruning the iovec). - */ - while (bytes || unlikely(i->count && !iov->iov_len)) { - int copy; - - copy = min(bytes, iov->iov_len - base); - BUG_ON(!i->count || i->count < copy); - i->count -= copy; - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - nr_segs--; - base = 0; - } - } - i->iov = iov; - i->iov_offset = base; - i->nr_segs = nr_segs; - } -} -EXPORT_SYMBOL(iov_iter_advance); - -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). 
- */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) -{ - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); - -/* - * Return the count of just the current iov_iter segment. - */ -size_t iov_iter_single_seg_count(const struct iov_iter *i) -{ - const struct iovec *iov = i->iov; - if (i->nr_segs == 1) - return i->count; - else - return min(i->count, iov->iov_len - i->iov_offset); -} -EXPORT_SYMBOL(iov_iter_single_seg_count); - /* * Performs necessary checks before doing a write * diff --git a/mm/iov_iter.c b/mm/iov_iter.c new file mode 100644 index 000000000000..10e46cd721de --- /dev/null +++ b/mm/iov_iter.c @@ -0,0 +1,224 @@ +#include +#include +#include + +size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, + struct iov_iter *i) +{ + size_t skip, copy, left, wanted; + const struct iovec *iov; + char __user *buf; + void *kaddr, *from; + + if (unlikely(bytes > i->count)) + bytes = i->count; + + if (unlikely(!bytes)) + return 0; + + wanted = bytes; + iov = i->iov; + skip = i->iov_offset; + buf = iov->iov_base + skip; + copy = min(bytes, iov->iov_len - skip); + + if (!fault_in_pages_writeable(buf, copy)) { + kaddr = kmap_atomic(page); + from = kaddr + offset; + + /* first chunk, usually the only one */ + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user_inatomic(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + if (likely(!bytes)) { + kunmap_atomic(kaddr); + goto done; + } + offset = from - kaddr; + buf += copy; + kunmap_atomic(kaddr); + copy = min(bytes, iov->iov_len - skip); + } + /* Too bad - revert to non-atomic kmap */ + kaddr = kmap(page); + from = kaddr + offset; + left = __copy_to_user(buf, from, copy); + copy -= left; + skip += copy; + from += copy; + bytes -= copy; + while (unlikely(!left && bytes)) { + iov++; + buf = iov->iov_base; + copy = min(bytes, iov->iov_len); + left = __copy_to_user(buf, from, copy); + copy -= left; + skip = copy; + from += copy; + bytes -= copy; + } + kunmap(page); +done: + i->count -= wanted - bytes; + i->nr_segs -= iov - i->iov; + i->iov = iov; + i->iov_offset = skip; + return wanted - bytes; +} +EXPORT_SYMBOL(copy_page_to_iter); + +static size_t __iovec_copy_from_user_inatomic(char *vaddr, + const struct iovec *iov, size_t base, size_t bytes) +{ + size_t copied = 0, left = 0; + + while (bytes) { + char __user *buf = iov->iov_base + base; + int copy = min(bytes, iov->iov_len - base); + + base = 0; + left = __copy_from_user_inatomic(vaddr, buf, copy); + copied += copy; + bytes -= copy; + vaddr += copy; + iov++; + + if (unlikely(left)) + break; + } + return copied - left; +} + +/* + * Copy as much as we can into the page and return the number of bytes which + * were successfully copied. If a fault is encountered then return the number of + * bytes which were copied. 
+ */ +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap_atomic(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr); + + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); + +/* + * This has the same sideeffects and return value as + * iov_iter_copy_from_user_atomic(). + * The difference is that it attempts to resolve faults. + * Page must not be locked. + */ +size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap(page); + return copied; +} +EXPORT_SYMBOL(iov_iter_copy_from_user); + +void iov_iter_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + i->count -= bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + unsigned long nr_segs = i->nr_segs; + + /* + * The !iov->iov_len check ensures we skip over unlikely + * zero-length segments (without overruning the iovec). + */ + while (bytes || unlikely(i->count && !iov->iov_len)) { + int copy; + + copy = min(bytes, iov->iov_len - base); + BUG_ON(!i->count || i->count < copy); + i->count -= copy; + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + i->nr_segs = nr_segs; + } +} +EXPORT_SYMBOL(iov_iter_advance); + +/* + * Fault in the first iovec of the given iov_iter, to a maximum length + * of bytes. Returns 0 on success, or non-zero if the memory could not be + * accessed (ie. because it is an invalid address). + * + * writev-intensive code may want this to prefault several iovecs -- that + * would be possible (callers must not rely on the fact that _only_ the + * first iovec will be faulted with the current implementation). + */ +int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) +{ + char __user *buf = i->iov->iov_base + i->iov_offset; + bytes = min(bytes, i->iov->iov_len - i->iov_offset); + return fault_in_pages_readable(buf, bytes); +} +EXPORT_SYMBOL(iov_iter_fault_in_readable); + +/* + * Return the count of just the current iov_iter segment. 
+ */ +size_t iov_iter_single_seg_count(const struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, iov->iov_len - i->iov_offset); +} +EXPORT_SYMBOL(iov_iter_single_seg_count); From 66f5dcef137beef14560f1f6eba6717028b89089 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Feb 2014 13:27:03 -0500 Subject: [PATCH 46/63] ocfs2: don't open-code kernel_sendmsg() Signed-off-by: Al Viro --- fs/ocfs2/cluster/tcp.c | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 2cd2406b4140..68d80c316381 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -940,33 +940,21 @@ static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, size_t veclen, size_t total) { int ret; - mm_segment_t oldfs; - struct msghdr msg = { - .msg_iov = (struct iovec *)vec, - .msg_iovlen = veclen, - }; + struct msghdr msg; if (sock == NULL) { ret = -EINVAL; goto out; } - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_sendmsg(sock, &msg, total); - set_fs(oldfs); - if (ret != total) { - mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, - total); - if (ret >= 0) - ret = -EPIPE; /* should be smarter, I bet */ - goto out; - } - - ret = 0; + ret = kernel_sendmsg(sock, &msg, vec, veclen, total); + if (likely(ret == total)) + return 0; + mlog(ML_ERROR, "sendmsg returned %d instead of %zu\n", ret, total); + if (ret >= 0) + ret = -EPIPE; /* should be smarter, I bet */ out: - if (ret < 0) - mlog(0, "returning error: %d\n", ret); + mlog(0, "returning error: %d\n", ret); return ret; } From 480f40de91e74190281309fd0ecb2d0414603c2e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Feb 2014 13:44:43 -0500 Subject: [PATCH 47/63] lustre: switch to kernel_sendmsg() (casts are due to misannotations in lustre; it uses iovec where kvec would be correct type; too much noise to properly annotate right now). 
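Same substitution as in the ocfs2 patch above; the generic shape of the conversion is (illustrative sketch):

        struct kvec iov = { .iov_base = buffer, .iov_len = nob };
        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
        int rc;

        /* was: set_fs(KERNEL_DS); rc = sock_sendmsg(sock, &msg, nob); set_fs(oldfs); */
        rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);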
Signed-off-by: Al Viro --- .../lnet/klnds/socklnd/socklnd_lib-linux.c | 30 +++---------------- .../lustre/lustre/libcfs/linux/linux-tcpip.c | 12 ++------ 2 files changed, 6 insertions(+), 36 deletions(-) diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c index 80141aa32c21..c3b67165883b 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -99,16 +99,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_niov; #endif - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = scratchiov, - .msg_iovlen = niov, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = MSG_DONTWAIT - }; - mm_segment_t oldmm = get_fs(); + struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; int i; for (nob = i = 0; i < niov; i++) { @@ -120,9 +111,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) nob < tx->tx_resid) msg.msg_flags |= MSG_MORE; - set_fs (KERNEL_DS); - rc = sock_sendmsg(sock, &msg, nob); - set_fs (oldmm); + rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob); } return rc; } @@ -174,16 +163,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_nkiov; #endif - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = scratchiov, - .msg_iovlen = niov, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = MSG_DONTWAIT - }; - mm_segment_t oldmm = get_fs(); + struct msghdr msg = {.msg_flags = MSG_DONTWAIT}; int i; for (nob = i = 0; i < niov; i++) { @@ -196,9 +176,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) nob < tx->tx_resid) msg.msg_flags |= MSG_MORE; - set_fs (KERNEL_DS); - rc = sock_sendmsg(sock, &msg, nob); - set_fs (oldmm); + rc = kernel_sendmsg(sock, &msg, (struct kvec *)scratchiov, niov, nob); for (i = 0; i < niov; i++) kunmap(kiov[i].kiov_page); diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c index e6069d78af6b..9bd08d666413 100644 --- a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c @@ -265,17 +265,11 @@ libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout) * empty enough to take the whole message immediately */ for (;;) { - struct iovec iov = { + struct kvec iov = { .iov_base = buffer, .iov_len = nob }; struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, .msg_flags = (timeout == 0) ? 
MSG_DONTWAIT : 0 }; @@ -297,11 +291,9 @@ libcfs_sock_write (struct socket *sock, void *buffer, int nob, int timeout) } } - set_fs (KERNEL_DS); then = jiffies; - rc = sock_sendmsg (sock, &msg, iov.iov_len); + rc = kernel_sendmsg(sock, &msg, &iov, 1, nob); ticks -= jiffies - then; - set_fs (oldmm); if (rc == nob) return 0; From 86d564c84c38b1ec06d9f2120d6a7373dcaeff0c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Feb 2014 20:42:52 -0500 Subject: [PATCH 48/63] constify blk_rq_map_user_iov() and friends sg_iovec array passed to it can be const Signed-off-by: Al Viro --- block/blk-map.c | 2 +- fs/bio.c | 10 +++++----- include/linux/bio.h | 5 +++-- include/linux/blkdev.h | 4 ++-- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index ae4ae1047fd9..86d93779c066 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -188,7 +188,7 @@ EXPORT_SYMBOL(blk_rq_map_user); * unmapping. */ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, - struct rq_map_data *map_data, struct sg_iovec *iov, + struct rq_map_data *map_data, const struct sg_iovec *iov, int iov_count, unsigned int len, gfp_t gfp_mask) { struct bio *bio; diff --git a/fs/bio.c b/fs/bio.c index 8754e7b6eb49..7065837ae9f3 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1003,7 +1003,7 @@ struct bio_map_data { }; static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int is_our_pages) { memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); @@ -1023,7 +1023,7 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, sizeof(struct sg_iovec) * iov_count, gfp_mask); } -static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, +static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count, int to_user, int from_user, int do_free_page) { int ret = 0, i; @@ -1121,7 +1121,7 @@ EXPORT_SYMBOL(bio_uncopy_user); */ struct bio *bio_copy_user_iov(struct request_queue *q, struct rq_map_data *map_data, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { struct bio_map_data *bmd; @@ -1260,7 +1260,7 @@ EXPORT_SYMBOL(bio_copy_user); static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { int i, j; @@ -1408,7 +1408,7 @@ EXPORT_SYMBOL(bio_map_user); * device. Returns an error pointer in case of error. 
*/ struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, - struct sg_iovec *iov, int iov_count, + const struct sg_iovec *iov, int iov_count, int write_to_vm, gfp_t gfp_mask) { struct bio *bio; diff --git a/include/linux/bio.h b/include/linux/bio.h index 5a4d39b4686b..21e27208316c 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -388,7 +388,7 @@ struct sg_iovec; struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, struct block_device *, - struct sg_iovec *, int, int, gfp_t); + const struct sg_iovec *, int, int, gfp_t); extern void bio_unmap_user(struct bio *); extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, gfp_t); @@ -414,7 +414,8 @@ extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, unsigned long, unsigned int, int, gfp_t); extern struct bio *bio_copy_user_iov(struct request_queue *, - struct rq_map_data *, struct sg_iovec *, + struct rq_map_data *, + const struct sg_iovec *, int, int, gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4afa4f8f6090..a639fd8a6d7b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -823,8 +823,8 @@ extern int blk_rq_map_user(struct request_queue *, struct request *, extern int blk_rq_unmap_user(struct bio *); extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); extern int blk_rq_map_user_iov(struct request_queue *, struct request *, - struct rq_map_data *, struct sg_iovec *, int, - unsigned int, gfp_t); + struct rq_map_data *, const struct sg_iovec *, + int, unsigned int, gfp_t); extern int blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, From f730c848affc05fb7262574b06e0cd7e1fa96096 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Feb 2014 21:07:38 -0500 Subject: [PATCH 49/63] drbd: don't open-code kernel_recvmsg() Signed-off-by: Al Viro --- drivers/block/drbd/drbd_receiver.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d073305ffd5e..1385714eccb7 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -468,24 +468,14 @@ static void drbd_wait_ee_list_empty(struct drbd_conf *mdev, static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags) { - mm_segment_t oldfs; struct kvec iov = { .iov_base = buf, .iov_len = size, }; struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&iov, .msg_flags = (flags ? 
flags : MSG_WAITALL | MSG_NOSIGNAL) }; - int rv; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); - set_fs(oldfs); - - return rv; + return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags); } static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size) From 920220c11122952061f2d81a9f06ca077bb744d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 8 Feb 2014 21:09:07 -0500 Subject: [PATCH 50/63] ocfs2: don't open-code kernel_recvmsg() Signed-off-by: Al Viro --- fs/ocfs2/cluster/tcp.c | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 68d80c316381..ea63d6461f55 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -916,24 +916,9 @@ static struct o2net_msg_handler *o2net_handler_get(u32 msg_type, u32 key) static int o2net_recv_tcp_msg(struct socket *sock, void *data, size_t len) { - int ret; - mm_segment_t oldfs; - struct kvec vec = { - .iov_len = len, - .iov_base = data, - }; - struct msghdr msg = { - .msg_iovlen = 1, - .msg_iov = (struct iovec *)&vec, - .msg_flags = MSG_DONTWAIT, - }; - - oldfs = get_fs(); - set_fs(get_ds()); - ret = sock_recvmsg(sock, &msg, len, msg.msg_flags); - set_fs(oldfs); - - return ret; + struct kvec vec = { .iov_len = len, .iov_base = data, }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT, }; + return kernel_recvmsg(sock, &msg, &vec, 1, len, msg.msg_flags); } static int o2net_send_tcp_msg(struct socket *sock, struct kvec *vec, From b2f42cfeeb0452ca3e004c3014cda99b53554d47 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Feb 2014 12:07:46 -0500 Subject: [PATCH 51/63] lustre: don't open-code kernel_recvmsg() Signed-off-by: Al Viro --- .../lnet/klnds/socklnd/socklnd_lib-linux.c | 30 +++++-------------- .../lustre/lustre/libcfs/linux/linux-tcpip.c | 12 ++------ 2 files changed, 9 insertions(+), 33 deletions(-) diff --git a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c index c3b67165883b..733c79e1f12d 100644 --- a/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c +++ b/drivers/staging/lustre/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -215,15 +215,8 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) #endif struct iovec *iov = conn->ksnc_rx_iov; struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = scratchiov, - .msg_iovlen = niov, - .msg_control = NULL, - .msg_controllen = 0, .msg_flags = 0 }; - mm_segment_t oldmm = get_fs(); int nob; int i; int rc; @@ -241,10 +234,8 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) } LASSERT (nob <= conn->ksnc_rx_nob_wanted); - set_fs (KERNEL_DS); - rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT); - /* NB this is just a boolean..........................^ */ - set_fs (oldmm); + rc = kernel_recvmsg(conn->ksnc_sock, &msg, + (struct kvec *)scratchiov, niov, nob, MSG_DONTWAIT); saved_csum = 0; if (conn->ksnc_proto == &ksocknal_protocol_v2x) { @@ -333,14 +324,8 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) #endif lnet_kiov_t *kiov = conn->ksnc_rx_kiov; struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = scratchiov, - .msg_control = NULL, - .msg_controllen = 0, .msg_flags = 0 }; - mm_segment_t oldmm = get_fs(); int nob; int i; int rc; @@ -348,12 +333,13 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) void *addr; int sum; int fragnob; + int n; /* NB we can't trust socket ops to either consume our iovs * or leave them alone. 
*/ if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) { nob = scratchiov[0].iov_len; - msg.msg_iovlen = 1; + n = 1; } else { for (nob = i = 0; i < niov; i++) { @@ -361,15 +347,13 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; } - msg.msg_iovlen = niov; + n = niov; } LASSERT (nob <= conn->ksnc_rx_nob_wanted); - set_fs (KERNEL_DS); - rc = sock_recvmsg (conn->ksnc_sock, &msg, nob, MSG_DONTWAIT); - /* NB this is just a boolean.......................^ */ - set_fs (oldmm); + rc = kernel_recvmsg(conn->ksnc_sock, &msg, + (struct kvec *)scratchiov, n, nob, MSG_DONTWAIT); if (conn->ksnc_msg.ksm_csum != 0) { for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { diff --git a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c index 9bd08d666413..7539fe16d76f 100644 --- a/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c +++ b/drivers/staging/lustre/lustre/libcfs/linux/linux-tcpip.c @@ -330,17 +330,11 @@ libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout) LASSERT (ticks > 0); for (;;) { - struct iovec iov = { + struct kvec iov = { .iov_base = buffer, .iov_len = nob }; struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, .msg_flags = 0 }; @@ -359,11 +353,9 @@ libcfs_sock_read (struct socket *sock, void *buffer, int nob, int timeout) return rc; } - set_fs(KERNEL_DS); then = jiffies; - rc = sock_recvmsg(sock, &msg, iov.iov_len, 0); + rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0); ticks -= jiffies - then; - set_fs(oldmm); if (rc < 0) return rc; From 41fc56d573c35a212688b12b48af8c303f9bb6d2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Feb 2014 12:58:52 -0500 Subject: [PATCH 52/63] kill the 4th argument of __generic_file_aio_write() It's always equal to &iocb->ki_pos, where iocb is the value of the 1st argument. 
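To make the redundancy concrete, here is the call-site shape being removed, sketched from the hunks below (not a complete function):

    /* before: every caller passes a pointer into its own iocb */
    ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);

    /* after: the callee reads the position straight from the iocb */
    ret = __generic_file_aio_write(iocb, iov, nr_segs);
    /* ...and inside __generic_file_aio_write(): */
    loff_t pos = iocb->ki_pos;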
Signed-off-by: Al Viro --- fs/block_dev.c | 2 +- fs/ext4/file.c | 2 +- fs/udf/file.c | 2 +- include/linux/fs.h | 3 +-- mm/filemap.c | 13 ++++++------- 5 files changed, 10 insertions(+), 12 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 1e86823a9cbd..764bd3b8d2fa 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1518,7 +1518,7 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); blk_start_plug(&plug); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs); if (ret > 0) { ssize_t err; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1a5073959f32..d564bcfb23c5 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -146,7 +146,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, overwrite = 1; } - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs); mutex_unlock(&inode->i_mutex); if (ret > 0) { diff --git a/fs/udf/file.c b/fs/udf/file.c index 1037637957c7..d2c170f8b035 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -171,7 +171,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } else up_write(&iinfo->i_data_sem); - retval = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + retval = __generic_file_aio_write(iocb, iov, nr_segs); mutex_unlock(&inode->i_mutex); if (retval > 0) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 2a5b1744f80a..e677d1e1189f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2392,8 +2392,7 @@ extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr, unsigned long size, pgoff_t pgoff); int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); -extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, - loff_t *); +extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long); extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, unsigned long *, loff_t, loff_t *, size_t, size_t); diff --git a/mm/filemap.c b/mm/filemap.c index c4730efa5d9e..ce2246dd90de 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2222,14 +2222,14 @@ EXPORT_SYMBOL(generic_file_buffered_write); * avoid syncing under i_mutex. 
*/ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) + unsigned long nr_segs) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; size_t ocount; /* original count */ size_t count; /* after file limit checks */ struct inode *inode = mapping->host; - loff_t pos; + loff_t pos = iocb->ki_pos; ssize_t written; ssize_t err; @@ -2239,7 +2239,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, return err; count = ocount; - pos = *ppos; /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; @@ -2266,7 +2265,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ssize_t written_buffered; written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - ppos, count, ocount); + &iocb->ki_pos, count, ocount); if (written < 0 || written == count) goto out; /* @@ -2276,7 +2275,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, pos += written; count -= written; written_buffered = generic_file_buffered_write(iocb, iov, - nr_segs, pos, ppos, count, + nr_segs, pos, &iocb->ki_pos, count, written); /* * If generic_file_buffered_write() retuned a synchronous error @@ -2310,7 +2309,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } } else { written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, ppos, count, written); + pos, &iocb->ki_pos, count, written); } out: current->backing_dev_info = NULL; @@ -2339,7 +2338,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs); mutex_unlock(&inode->i_mutex); if (ret > 0) { From fcacafd269adc88f41b68cb77a3f957a66563428 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 9 Feb 2014 13:37:49 -0500 Subject: [PATCH 53/63] kill the 5th argument of generic_file_buffered_write() same story - it's &iocb->ki_pos in all cases Signed-off-by: Al Viro --- fs/ceph/file.c | 3 +-- fs/ocfs2/file.c | 2 +- fs/xfs/xfs_file.c | 2 +- include/linux/fs.h | 2 +- mm/filemap.c | 9 ++++----- 5 files changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 09c7afe32e49..a798db5e5e39 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -978,8 +978,7 @@ retry_snap: * can not run at the same time */ written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, &iocb->ki_pos, - count, 0); + pos, count, 0); mutex_unlock(&inode->i_mutex); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 51632c40e896..89099cce14fe 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2383,7 +2383,7 @@ relock: } else { current->backing_dev_info = file->f_mapping->backing_dev_info; written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, - ppos, count, 0); + count, 0); current->backing_dev_info = NULL; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 64b48eade91d..175ce58fbfa3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -738,7 +738,7 @@ xfs_file_buffered_aio_write( write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, &iocb->ki_pos, count, 0); + pos, count, 0); /* * If we just got an ENOSPC, try to write back all dirty inodes to diff --git a/include/linux/fs.h b/include/linux/fs.h index 
e677d1e1189f..830e37420f5e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2397,7 +2397,7 @@ extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsi extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, unsigned long *, loff_t, loff_t *, size_t, size_t); extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, - unsigned long, loff_t, loff_t *, size_t, ssize_t); + unsigned long, loff_t, size_t, ssize_t); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); extern int generic_segment_checks(const struct iovec *iov, diff --git a/mm/filemap.c b/mm/filemap.c index ce2246dd90de..9d515a1a2372 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2183,7 +2183,7 @@ again: ssize_t generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, + unsigned long nr_segs, loff_t pos, size_t count, ssize_t written) { struct file *file = iocb->ki_filp; @@ -2195,7 +2195,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, if (likely(status >= 0)) { written += status; - *ppos = pos + status; + iocb->ki_pos = pos + status; } return written ? written : status; @@ -2275,8 +2275,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, pos += written; count -= written; written_buffered = generic_file_buffered_write(iocb, iov, - nr_segs, pos, &iocb->ki_pos, count, - written); + nr_segs, pos, count, written); /* * If generic_file_buffered_write() retuned a synchronous error * then we want to return the number of bytes which were @@ -2309,7 +2308,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } } else { written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, &iocb->ki_pos, count, written); + pos, count, written); } out: current->backing_dev_info = NULL; From 867c4f9329e1bf7d0967bec761f033373f72b55e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 11 Feb 2014 19:31:06 -0500 Subject: [PATCH 54/63] btrfs_file_aio_write(): get rid of ppos Signed-off-by: Al Viro --- fs/btrfs/file.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 34e096201da1..f6032a2bfab9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1631,7 +1631,7 @@ again: static ssize_t __btrfs_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, - loff_t *ppos, size_t count, size_t ocount) + size_t count, size_t ocount) { struct file *file = iocb->ki_filp; struct iov_iter i; @@ -1640,7 +1640,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t endbyte; int err; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos, + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, &iocb->ki_pos, count, ocount); if (written < 0 || written == count) @@ -1659,7 +1659,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, if (err) goto out; written += written_buffered; - *ppos = pos + written_buffered; + iocb->ki_pos = pos + written_buffered; invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); out: @@ -1691,7 +1691,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; - loff_t 
*ppos = &iocb->ki_pos; u64 start_pos; ssize_t num_written = 0; ssize_t err = 0; @@ -1759,7 +1758,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, if (unlikely(file->f_flags & O_DIRECT)) { num_written = __btrfs_direct_write(iocb, iov, nr_segs, - pos, ppos, count, ocount); + pos, count, ocount); } else { struct iov_iter i; @@ -1767,7 +1766,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, num_written = __btrfs_buffered_write(file, &i, pos); if (num_written > 0) - *ppos = pos + num_written; + iocb->ki_pos = pos + num_written; } mutex_unlock(&inode->i_mutex); From 5cb6c6c7eb1ed24744b41fad47d9a25b72207098 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 11 Feb 2014 20:58:20 -0500 Subject: [PATCH 55/63] generic_file_direct_write(): get rid of ppos argument always equal to &iocb->ki_pos. Signed-off-by: Al Viro --- fs/btrfs/file.c | 2 +- fs/fuse/file.c | 3 +-- fs/ocfs2/file.c | 2 +- fs/xfs/xfs_file.c | 2 +- include/linux/fs.h | 2 +- mm/filemap.c | 6 +++--- 6 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f6032a2bfab9..8ed4b165abbd 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1640,7 +1640,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, loff_t endbyte; int err; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, &iocb->ki_pos, + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, count, ocount); if (written < 0 || written == count) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a91d3b4d32f3..fd06d1ebc2eb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1143,8 +1143,7 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, goto out; if (file->f_flags & O_DIRECT) { - written = generic_file_direct_write(iocb, iov, &nr_segs, - pos, &iocb->ki_pos, + written = generic_file_direct_write(iocb, iov, &nr_segs, pos, count, ocount); if (written < 0 || written == count) goto out; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 89099cce14fe..77b8a742866f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2375,7 +2375,7 @@ relock: if (direct_io) { written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, - ppos, count, ocount); + count, ocount); if (written < 0) { ret = written; goto out_dio; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 175ce58fbfa3..e593554ce65e 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -699,7 +699,7 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); ret = generic_file_direct_write(iocb, iovp, - &nr_segs, pos, &iocb->ki_pos, count, ocount); + &nr_segs, pos, count, ocount); out: xfs_rw_iunlock(ip, iolock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 830e37420f5e..9dfd7c7ff8e3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2395,7 +2395,7 @@ extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsig extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long); extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, - unsigned long *, loff_t, loff_t *, size_t, size_t); + unsigned long *, loff_t, size_t, size_t); extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, unsigned long, loff_t, size_t, ssize_t); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); diff --git a/mm/filemap.c b/mm/filemap.c index 
9d515a1a2372..93e9cf576452 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1985,7 +1985,7 @@ EXPORT_SYMBOL(pagecache_write_end); ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long *nr_segs, loff_t pos, loff_t *ppos, + unsigned long *nr_segs, loff_t pos, size_t count, size_t ocount) { struct file *file = iocb->ki_filp; @@ -2046,7 +2046,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = pos; + iocb->ki_pos = pos; } out: return written; @@ -2265,7 +2265,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ssize_t written_buffered; written = generic_file_direct_write(iocb, iov, &nr_segs, pos, - &iocb->ki_pos, count, ocount); + count, ocount); if (written < 0 || written == count) goto out; /* From 3b93f911d55cf8b3540f82fac4eeddfee7b5de0c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 11 Feb 2014 21:34:08 -0500 Subject: [PATCH 56/63] export generic_perform_write(), start getting rid of generic_file_buffer_write() Signed-off-by: Al Viro --- include/linux/fs.h | 2 ++ mm/filemap.c | 37 +++++++++++++++++++++---------------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 9dfd7c7ff8e3..87be51aff9d7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -48,6 +48,7 @@ struct cred; struct swap_info_struct; struct seq_file; struct workqueue_struct; +struct iov_iter; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -2396,6 +2397,7 @@ extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, un extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, unsigned long *, loff_t, size_t, size_t); +extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, unsigned long, loff_t, size_t, ssize_t); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); diff --git a/mm/filemap.c b/mm/filemap.c index 93e9cf576452..09bfc9b3bb51 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2092,7 +2092,7 @@ found: } EXPORT_SYMBOL(grab_cache_page_write_begin); -static ssize_t generic_perform_write(struct file *file, +ssize_t generic_perform_write(struct file *file, struct iov_iter *i, loff_t pos) { struct address_space *mapping = file->f_mapping; @@ -2180,6 +2180,7 @@ again: return written ? 
written : status; } +EXPORT_SYMBOL(generic_perform_write); ssize_t generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, @@ -2230,8 +2231,10 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, size_t count; /* after file limit checks */ struct inode *inode = mapping->host; loff_t pos = iocb->ki_pos; - ssize_t written; + ssize_t written = 0; ssize_t err; + ssize_t status; + struct iov_iter from; ocount = 0; err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); @@ -2242,8 +2245,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; - written = 0; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); if (err) goto out; @@ -2259,44 +2260,47 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, if (err) goto out; + iov_iter_init(&from, iov, nr_segs, count, 0); + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ if (unlikely(file->f_flags & O_DIRECT)) { loff_t endbyte; - ssize_t written_buffered; - written = generic_file_direct_write(iocb, iov, &nr_segs, pos, + written = generic_file_direct_write(iocb, iov, &from.nr_segs, pos, count, ocount); if (written < 0 || written == count) goto out; + iov_iter_advance(&from, written); + /* * direct-io write to a hole: fall through to buffered I/O * for completing the rest of the request. */ pos += written; count -= written; - written_buffered = generic_file_buffered_write(iocb, iov, - nr_segs, pos, count, written); + + status = generic_perform_write(file, &from, pos); /* - * If generic_file_buffered_write() retuned a synchronous error + * If generic_perform_write() returned a synchronous error * then we want to return the number of bytes which were * direct-written, or the error code if that was zero. Note * that this differs from normal direct-io semantics, which * will return -EFOO even if some bytes were written. */ - if (written_buffered < 0) { - err = written_buffered; + if (unlikely(status < 0) && !written) { + err = status; goto out; } - + iocb->ki_pos = pos + status; /* * We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT * semantics. 
*/ - endbyte = pos + written_buffered - written - 1; + endbyte = pos + status - 1; err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); if (err == 0) { - written = written_buffered; + written += status; invalidate_mapping_pages(mapping, pos >> PAGE_CACHE_SHIFT, endbyte >> PAGE_CACHE_SHIFT); @@ -2307,8 +2311,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, */ } } else { - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, count, written); + written = generic_perform_write(file, &from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; } out: current->backing_dev_info = NULL; From 0a64bc2c0474640a850febd5ac3abb8d45b32821 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 11 Feb 2014 22:25:22 -0500 Subject: [PATCH 57/63] xfs_file_buffered_aio_write(): switch to generic_perform_write() Signed-off-by: Al Viro --- fs/xfs/xfs_file.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e593554ce65e..c3f4289f6497 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -715,7 +715,7 @@ xfs_file_buffered_aio_write( const struct iovec *iovp, unsigned long nr_segs, loff_t pos, - size_t ocount) + size_t count) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -724,7 +724,7 @@ xfs_file_buffered_aio_write( ssize_t ret; int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - size_t count = ocount; + struct iov_iter from; xfs_rw_ilock(ip, iolock); @@ -732,14 +732,15 @@ xfs_file_buffered_aio_write( if (ret) goto out; + iov_iter_init(&from, iovp, nr_segs, count, 0); /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; write_retry: trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_buffered_write(iocb, iovp, nr_segs, - pos, count, 0); - + ret = generic_perform_write(file, &from, pos); + if (likely(ret >= 0)) + iocb->ki_pos = pos + ret; /* * If we just got an ENOSPC, try to write back all dirty inodes to * convert delalloc space to free up some of the excess reserved From aec605f4297d044a2a0d9ceedcf7d33af679c884 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 11 Feb 2014 22:28:43 -0500 Subject: [PATCH 58/63] ceph_aio_write(): switch to generic_perform_write() Signed-off-by: Al Viro --- fs/ceph/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a798db5e5e39..2d9088b1bcd9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -970,6 +970,7 @@ retry_snap: goto retry_snap; } } else { + struct iov_iter from; /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -977,8 +978,10 @@ retry_snap: * are pending vmtruncate. 
So write and vmtruncate * can not run at the same time */ - written = generic_file_buffered_write(iocb, iov, nr_segs, - pos, count, 0); + iov_iter_init(&from, iov, nr_segs, count, 0); + written = generic_perform_write(file, &from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; mutex_unlock(&inode->i_mutex); } From 58bfab395b302306baccbd1b5f38a9b890acb4e3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 11 Feb 2014 22:34:52 -0500 Subject: [PATCH 59/63] ocfs2_file_aio_write(): switch to generic_perform_write() Signed-off-by: Al Viro --- fs/ocfs2/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 77b8a742866f..9c27adf4ac72 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2381,9 +2381,12 @@ relock: goto out_dio; } } else { + struct iov_iter from; + iov_iter_init(&from, iov, nr_segs, count, 0); current->backing_dev_info = file->f_mapping->backing_dev_info; - written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, - count, 0); + written = generic_perform_write(file, &from, *ppos); + if (likely(written >= 0)) + iocb->ki_pos = *ppos + written; current->backing_dev_info = NULL; } From ccad2365668f12336dd497d9039e8584af836070 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 11 Feb 2014 22:36:48 -0500 Subject: [PATCH 60/63] kill generic_file_buffered_write() Signed-off-by: Al Viro --- include/linux/fs.h | 2 -- mm/filemap.c | 23 +---------------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 87be51aff9d7..c309b7a0da2d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2398,8 +2398,6 @@ extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsi extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, unsigned long *, loff_t, size_t, size_t); extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); -extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, - unsigned long, loff_t, size_t, ssize_t); extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos); extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos); extern int generic_segment_checks(const struct iovec *iov, diff --git a/mm/filemap.c b/mm/filemap.c index 09bfc9b3bb51..1a455142784d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -76,7 +76,7 @@ * ->mmap_sem * ->lock_page (access_process_vm) * - * ->i_mutex (generic_file_buffered_write) + * ->i_mutex (generic_perform_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * * bdi->wb.list_lock @@ -2182,27 +2182,6 @@ again: } EXPORT_SYMBOL(generic_perform_write); -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, - size_t count, ssize_t written) -{ - struct file *file = iocb->ki_filp; - ssize_t status; - struct iov_iter i; - - iov_iter_init(&i, iov, nr_segs, count, written); - status = generic_perform_write(file, &i, pos); - - if (likely(status >= 0)) { - written += status; - iocb->ki_pos = pos + status; - } - - return written ? written : status; -} -EXPORT_SYMBOL(generic_file_buffered_write); - /** * __generic_file_aio_write - write data to a file * @iocb: IO state structure (file, offset, etc.) 
From eab87235c0f5979503a547f836a93a3d327c4201 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 3 Apr 2014 22:44:19 -0400 Subject: [PATCH 61/63] ceph_sync_{,direct_}write: fix an oops on ceph_osdc_new_request() failure ceph_osdc_put_request(ERR_PTR(-error)) oopses. What we want there is break, not goto out. Signed-off-by: Al Viro --- fs/ceph/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2d9088b1bcd9..359805b671b9 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -600,7 +600,7 @@ ceph_sync_direct_write(struct kiocb *iocb, const struct iovec *iov, false); if (IS_ERR(req)) { ret = PTR_ERR(req); - goto out; + break; } num_pages = calc_pages_for(page_align, len); @@ -718,7 +718,7 @@ static ssize_t ceph_sync_write(struct kiocb *iocb, const struct iovec *iov, false); if (IS_ERR(req)) { ret = PTR_ERR(req); - goto out; + break; } /* From 19dfc1f5f2ef03a52aa30c8257c5745edef23f55 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 3 Apr 2014 10:27:17 -0400 Subject: [PATCH 62/63] cifs: fix the race in cifs_writev() O_APPEND handling there hadn't been completely fixed by Pavel's patch; it checks the right value, but it's racy - we can't really do that until i_mutex has been taken. Fix by switching to __generic_file_aio_write() (open-coding generic_file_aio_write(), actually) and pulling mutex_lock() above inode_size_read(). Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/cifs/file.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 3443b8f8e1c0..5bac2763c450 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2579,19 +2579,32 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, struct cifsInodeInfo *cinode = CIFS_I(inode); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; ssize_t rc = -EACCES; - loff_t lock_pos = pos; + loff_t lock_pos = iocb->ki_pos; - if (file->f_flags & O_APPEND) - lock_pos = i_size_read(inode); /* * We need to hold the sem to be sure nobody modifies lock list * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); + mutex_lock(&inode->i_mutex); + if (file->f_flags & O_APPEND) + lock_pos = i_size_read(inode); if (!cifs_find_lock_conflict(cfile, lock_pos, iov_length(iov, nr_segs), server->vals->exclusive_lock_type, NULL, - CIFS_WRITE_OP)) - rc = generic_file_aio_write(iocb, iov, nr_segs, pos); + CIFS_WRITE_OP)) { + rc = __generic_file_aio_write(iocb, iov, nr_segs); + mutex_unlock(&inode->i_mutex); + + if (rc > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - rc, rc); + if (rc < 0) + rc = err; + } + } else { + mutex_unlock(&inode->i_mutex); + } up_read(&cinode->lock_sem); return rc; } From a786c06d9f2719203c00b3d97b21f9a96980d0b5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 11 Apr 2014 12:01:03 -0400 Subject: [PATCH 63/63] missing bits of "splice: fix racy pipe->buffers uses" that commit has fixed only the parts of that mess in fs/splice.c itself; there had been more in several other ->splice_read() instances... 
Signed-off-by: Al Viro --- kernel/relay.c | 2 +- kernel/trace/trace.c | 4 ++-- mm/shmem.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/relay.c b/kernel/relay.c index 98833f664fb6..7d38607649a3 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1251,7 +1251,7 @@ static ssize_t subbuf_splice_actor(struct file *in, subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; pidx = (read_start / PAGE_SIZE) % subbuf_pages; poff = read_start & ~PAGE_MASK; - nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); + nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max); for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { unsigned int this_len, this_end, private; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7511de35257f..27924caaa124 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4410,7 +4410,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ - for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) { spd.pages[i] = alloc_page(GFP_KERNEL); if (!spd.pages[i]) break; @@ -5267,7 +5267,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, trace_access_lock(iter->cpu_file); entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); - for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { + for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; diff --git a/mm/shmem.c b/mm/shmem.c index 17d3799d04bd..37400a148f29 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1613,7 +1613,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, pipe->buffers); + nr_pages = min(req_pages, spd.nr_pages_max); spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);