From 66ee59af630fd8d5f4f56fb28162857e629aa0ab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Feb 2015 19:56:46 +0100 Subject: [PATCH 01/17] fs: remove ki_nbytes There is no need to pass the total request length in the kiocb, as we already get passed in through the iov_iter argument. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/aio.c | 34 ++++++++++++++++++---------------- fs/ceph/file.c | 2 +- fs/nfs/direct.c | 2 +- fs/ocfs2/file.c | 8 +++----- fs/read_write.c | 8 -------- fs/udf/file.c | 2 +- include/linux/aio.h | 1 - kernel/printk/printk.c | 2 +- mm/page_io.c | 1 - net/socket.c | 6 +++--- 10 files changed, 28 insertions(+), 38 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 118a2e0088d8..667054c7c067 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1344,12 +1344,13 @@ typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, int rw, char __user *buf, unsigned long *nr_segs, + size_t *len, struct iovec **iovec, bool compat) { ssize_t ret; - *nr_segs = kiocb->ki_nbytes; + *nr_segs = *len; #ifdef CONFIG_COMPAT if (compat) @@ -1364,21 +1365,22 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, if (ret < 0) return ret; - /* ki_nbytes now reflect bytes instead of segs */ - kiocb->ki_nbytes = ret; + /* len now reflect bytes instead of segs */ + *len = ret; return 0; } static ssize_t aio_setup_single_vector(struct kiocb *kiocb, int rw, char __user *buf, unsigned long *nr_segs, + size_t len, struct iovec *iovec) { - if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes))) + if (unlikely(!access_ok(!rw, buf, len))) return -EFAULT; iovec->iov_base = buf; - iovec->iov_len = kiocb->ki_nbytes; + iovec->iov_len = len; *nr_segs = 1; return 0; } @@ -1388,7 +1390,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb, * Performs the initial checks and io submission. */ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, - char __user *buf, bool compat) + char __user *buf, size_t len, bool compat) { struct file *file = req->ki_filp; ssize_t ret; @@ -1423,21 +1425,21 @@ rw_common: if (!rw_op && !iter_op) return -EINVAL; - ret = (opcode == IOCB_CMD_PREADV || - opcode == IOCB_CMD_PWRITEV) - ? aio_setup_vectored_rw(req, rw, buf, &nr_segs, - &iovec, compat) - : aio_setup_single_vector(req, rw, buf, &nr_segs, - iovec); + if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV) + ret = aio_setup_vectored_rw(req, rw, buf, &nr_segs, + &len, &iovec, compat); + else + ret = aio_setup_single_vector(req, rw, buf, &nr_segs, + len, iovec); if (!ret) - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + ret = rw_verify_area(rw, file, &req->ki_pos, len); if (ret < 0) { if (iovec != inline_vecs) kfree(iovec); return ret; } - req->ki_nbytes = ret; + len = ret; /* XXX: move/kill - rw_verify_area()? */ /* This matches the pread()/pwrite() logic */ @@ -1450,7 +1452,7 @@ rw_common: file_start_write(file); if (iter_op) { - iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes); + iov_iter_init(&iter, rw, iovec, nr_segs, len); ret = iter_op(req, &iter); } else { ret = rw_op(req, iovec, nr_segs, req->ki_pos); @@ -1553,10 +1555,10 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_obj.user = user_iocb; req->ki_user_data = iocb->aio_data; req->ki_pos = iocb->aio_offset; - req->ki_nbytes = iocb->aio_nbytes; ret = aio_run_iocb(req, iocb->aio_lio_opcode, (char __user *)(unsigned long)iocb->aio_buf, + iocb->aio_nbytes, compat); if (ret) goto out_put_req; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 905986dd4c3c..081c4e3f9e49 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -807,7 +807,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - size_t len = iocb->ki_nbytes; + size_t len = iov_iter_count(to); struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct page *pinned_page = NULL; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7077521acdf4..27cebf164070 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -265,7 +265,7 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t return -EINVAL; #else - VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); + VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (rw == READ) return nfs_file_direct_read(iocb, iter, pos); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 46e0d4e857c7..266845de2100 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2280,7 +2280,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)from->nr_segs); /* GRRRRR */ - if (iocb->ki_nbytes == 0) + if (count == 0) return 0; appending = file->f_flags & O_APPEND ? 1 : 0; @@ -2330,8 +2330,7 @@ relock: } can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file, ppos, - iocb->ki_nbytes, appending, + ret = ocfs2_prepare_inode_for_write(file, ppos, count, appending, &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); @@ -2339,8 +2338,7 @@ relock: } if (direct_io && !is_sync_kiocb(iocb)) - unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, - *ppos); + unaligned_dio = ocfs2_is_io_unaligned(inode, count, *ppos); /* * We can't complete the direct I/O as requested, fall back to diff --git a/fs/read_write.c b/fs/read_write.c index 8e1b68786d66..f8b8fc1316ab 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -343,7 +343,6 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = iov_iter_count(iter); iter->type |= READ; ret = file->f_op->read_iter(&kiocb, iter); @@ -366,7 +365,6 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = iov_iter_count(iter); iter->type |= WRITE; ret = file->f_op->write_iter(&kiocb, iter); @@ -426,7 +424,6 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) @@ -446,7 +443,6 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, READ, &iov, 1, len); ret = filp->f_op->read_iter(&kiocb, &iter); @@ -510,7 +506,6 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) @@ -530,7 +525,6 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, WRITE, &iov, 1, len); ret = filp->f_op->write_iter(&kiocb, &iter); @@ -719,7 +713,6 @@ static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iove init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, rw, iov, nr_segs, len); ret = fn(&kiocb, &iter); @@ -737,7 +730,6 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); if (ret == -EIOCBQUEUED) diff --git a/fs/udf/file.c b/fs/udf/file.c index 08f3555fbeac..9c0b6da9dbb3 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); int err, pos; - size_t count = iocb->ki_nbytes; + size_t count = iov_iter_count(from); struct udf_inode_info *iinfo = UDF_I(inode); mutex_lock(&inode->i_mutex); diff --git a/include/linux/aio.h b/include/linux/aio.h index d9c92daa3944..132d1ecba435 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -42,7 +42,6 @@ struct kiocb { __u64 ki_user_data; /* user's data for completion */ loff_t ki_pos; - size_t ki_nbytes; /* copy of iocb->aio_nbytes */ struct list_head ki_list; /* the aio core uses this * for cancellation */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c06df7de0963..60b2aa2a2da9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -521,7 +521,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ - size_t len = iocb->ki_nbytes; + size_t len = iov_iter_count(from); ssize_t ret = len; if (len > LOG_LINE_MAX) diff --git a/mm/page_io.c b/mm/page_io.c index e6045804c8d8..7ef21577856c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -274,7 +274,6 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); - kiocb.ki_nbytes = PAGE_SIZE; set_page_writeback(page); unlock_page(page); diff --git a/net/socket.c b/net/socket.c index bbedbfcb42c2..f92145554f34 100644 --- a/net/socket.c +++ b/net/socket.c @@ -858,11 +858,11 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) if (iocb->ki_pos != 0) return -ESPIPE; - if (iocb->ki_nbytes == 0) /* Match SYS5 behaviour */ + if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; res = __sock_recvmsg(iocb, sock, &msg, - iocb->ki_nbytes, msg.msg_flags); + iov_iter_count(to), msg.msg_flags); *to = msg.msg_iter; return res; } @@ -883,7 +883,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; - res = __sock_sendmsg(iocb, sock, &msg, iocb->ki_nbytes); + res = __sock_sendmsg(iocb, sock, &msg, iov_iter_count(from)); *from = msg.msg_iter; return res; } From 9d5722b7777e64de2d932f46cfee7765fdcc60d6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Feb 2015 14:59:43 +0100 Subject: [PATCH 02/17] fuse: handle synchronous iocbs internally Based on a patch from Maxim Patlasov . Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/fuse/file.c | 51 +++++++++++++++++++++++++++++------------------- fs/fuse/fuse_i.h | 1 + 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c01ec3bdcfd8..f81d83eb9758 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -528,6 +528,17 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } } +static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) +{ + if (io->err) + return io->err; + + if (io->bytes >= 0 && io->write) + return -EIO; + + return io->bytes < 0 ? io->size : io->bytes; +} + /** * In case of short read, the caller sets 'pos' to the position of * actual end of fuse request in IO request. Otherwise, if bytes_requested @@ -546,6 +557,7 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) */ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) { + bool is_sync = is_sync_kiocb(io->iocb); int left; spin_lock(&io->lock); @@ -555,27 +567,21 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) io->bytes = pos; left = --io->reqs; + if (!left && is_sync) + complete(io->done); spin_unlock(&io->lock); - if (!left) { - long res; + if (!left && !is_sync) { + ssize_t res = fuse_get_res_by_io(io); - if (io->err) - res = io->err; - else if (io->bytes >= 0 && io->write) - res = -EIO; - else { - res = io->bytes < 0 ? io->size : io->bytes; + if (res >= 0) { + struct inode *inode = file_inode(io->iocb->ki_filp); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); - if (!is_sync_kiocb(io->iocb)) { - struct inode *inode = file_inode(io->iocb->ki_filp); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - spin_unlock(&fc->lock); - } + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + spin_unlock(&fc->lock); } aio_complete(io->iocb, res, 0); @@ -2801,6 +2807,7 @@ static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { + DECLARE_COMPLETION_ONSTACK(wait); ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; @@ -2852,6 +2859,9 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) io->async = false; + if (io->async && is_sync_kiocb(iocb)) + io->done = &wait; + if (rw == WRITE) ret = __fuse_direct_write(io, iter, &pos); else @@ -2864,11 +2874,12 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; - ret = wait_on_sync_kiocb(iocb); - } else { - kfree(io); + wait_for_completion(&wait); + ret = fuse_get_res_by_io(io); } + kfree(io); + if (rw == WRITE) { if (ret > 0) fuse_write_update_size(inode, pos); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1cdfb07c1376..7354dc142a50 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -263,6 +263,7 @@ struct fuse_io_priv { int err; struct kiocb *iocb; struct file *file; + struct completion *done; }; /** From 599bd19bdc4c6b20fd91d50f2f79dececbaf80c1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Feb 2015 19:59:44 +0100 Subject: [PATCH 03/17] fs: don't allow to complete sync iocbs through aio_complete The AIO interface is fairly complex because it tries to allow filesystems to always work async and then wakeup a synchronous caller through aio_complete. It turns out that basically no one was doing this to avoid the complexity and context switches, and we've already fixed up the remaining users and can now get rid of this case. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/aio.c | 24 +----------------------- fs/ecryptfs/file.c | 6 ------ fs/read_write.c | 26 ++++++++------------------ include/linux/aio.h | 4 ---- net/socket.c | 9 +++------ 5 files changed, 12 insertions(+), 57 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 667054c7c067..8ca8df1c3550 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -778,22 +778,6 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, return 0; } -/* wait_on_sync_kiocb: - * Waits on the given sync kiocb to complete. - */ -ssize_t wait_on_sync_kiocb(struct kiocb *req) -{ - while (!req->ki_ctx) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (req->ki_ctx) - break; - io_schedule(); - } - __set_current_state(TASK_RUNNING); - return req->ki_user_data; -} -EXPORT_SYMBOL(wait_on_sync_kiocb); - /* * exit_aio: called when the last user of mm goes away. At this point, there is * no way for any new requests to be submited or any of the io_* syscalls to be @@ -1025,13 +1009,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) * ref, no other paths have a way to get another ref * - the sync task helpfully left a reference to itself in the iocb */ - if (is_sync_kiocb(iocb)) { - iocb->ki_user_data = res; - smp_wmb(); - iocb->ki_ctx = ERR_PTR(-EXDEV); - wake_up_process(iocb->ki_obj.tsk); - return; - } + BUG_ON(is_sync_kiocb(iocb)); if (iocb->ki_list.next) { unsigned long flags; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 6f4e659f508f..a36da8841e0c 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -52,12 +52,6 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, struct file *file = iocb->ki_filp; rc = generic_file_read_iter(iocb, to); - /* - * Even though this is a async interface, we need to wait - * for IO to finish to update atime - */ - if (-EIOCBQUEUED == rc) - rc = wait_on_sync_kiocb(iocb); if (rc >= 0) { path = ecryptfs_dentry_to_lower_path(file->f_path.dentry); touch_atime(path); diff --git a/fs/read_write.c b/fs/read_write.c index f8b8fc1316ab..76e324e8ce8d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -346,9 +346,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) iter->type |= READ; ret = file->f_op->read_iter(&kiocb, iter); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); - + BUG_ON(ret == -EIOCBQUEUED); if (ret > 0) *ppos = kiocb.ki_pos; return ret; @@ -368,9 +366,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) iter->type |= WRITE; ret = file->f_op->write_iter(&kiocb, iter); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); - + BUG_ON(ret == -EIOCBQUEUED); if (ret > 0) *ppos = kiocb.ki_pos; return ret; @@ -426,8 +422,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp kiocb.ki_pos = *ppos; ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } @@ -446,8 +441,7 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p iov_iter_init(&iter, READ, &iov, 1, len); ret = filp->f_op->read_iter(&kiocb, &iter); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } @@ -508,8 +502,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof kiocb.ki_pos = *ppos; ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } @@ -528,8 +521,7 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo iov_iter_init(&iter, WRITE, &iov, 1, len); ret = filp->f_op->write_iter(&kiocb, &iter); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } @@ -716,8 +708,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iove iov_iter_init(&iter, rw, iov, nr_segs, len); ret = fn(&kiocb, &iter); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } @@ -732,8 +723,7 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, kiocb.ki_pos = *ppos; ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } diff --git a/include/linux/aio.h b/include/linux/aio.h index 132d1ecba435..f8516430490d 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -37,7 +37,6 @@ struct kiocb { union { void __user *user; - struct task_struct *tsk; } ki_obj; __u64 ki_user_data; /* user's data for completion */ @@ -63,13 +62,11 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) *kiocb = (struct kiocb) { .ki_ctx = NULL, .ki_filp = filp, - .ki_obj.tsk = current, }; } /* prototypes */ #ifdef CONFIG_AIO -extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); extern void aio_complete(struct kiocb *iocb, long res, long res2); struct mm_struct; extern void exit_aio(struct mm_struct *mm); @@ -77,7 +74,6 @@ extern long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user *__user *iocbpp, bool compat); void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else -static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } static inline void aio_complete(struct kiocb *iocb, long res, long res2) { } struct mm_struct; static inline void exit_aio(struct mm_struct *mm) { } diff --git a/net/socket.c b/net/socket.c index f92145554f34..f6c519d7b3ba 100644 --- a/net/socket.c +++ b/net/socket.c @@ -633,8 +633,7 @@ static int do_sock_sendmsg(struct socket *sock, struct msghdr *msg, init_sync_kiocb(&iocb, NULL); ret = nosec ? __sock_sendmsg_nosec(&iocb, sock, msg, size) : __sock_sendmsg(&iocb, sock, msg, size); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&iocb); + BUG_ON(ret == -EIOCBQUEUED); return ret; } @@ -766,8 +765,7 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg, init_sync_kiocb(&iocb, NULL); ret = __sock_recvmsg(&iocb, sock, msg, size, flags); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&iocb); + BUG_ON(ret == -EIOCBQUEUED); return ret; } EXPORT_SYMBOL(sock_recvmsg); @@ -780,8 +778,7 @@ static int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, init_sync_kiocb(&iocb, NULL); ret = __sock_recvmsg_nosec(&iocb, sock, msg, size, flags); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&iocb); + BUG_ON(ret == -EIOCBQUEUED); return ret; } From 04b2fa9f8f36ec6fb6fd1c9dc9df6fff0cd27323 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 2 Feb 2015 14:49:06 +0100 Subject: [PATCH 04/17] fs: split generic and aio kiocb Most callers in the kernel want to perform synchronous file I/O, but still have to bloat the stack with a full struct kiocb. Split out the parts needed in filesystem code from those in the aio code, and only allocate those needed to pass down argument on the stack. The aio code embedds the generic iocb in the one it allocates and can easily get back to it by using container_of. Also add a ->ki_complete method to struct kiocb, this is used to call into the aio code and thus removes the dependency on aio for filesystems impementing asynchronous operations. It will also allow other callers to substitute their own completion callback. We also add a new ->ki_flags field to work around the nasty layering violation recently introduced in commit 5e33f6 ("usb: gadget: ffs: add eventfd notification about ffs events"). Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- drivers/usb/gadget/function/f_fs.c | 5 +- drivers/usb/gadget/legacy/inode.c | 5 +- fs/aio.c | 94 +++++++++++++++++++++--------- fs/direct-io.c | 4 +- fs/fuse/file.c | 2 +- fs/nfs/direct.c | 2 +- include/linux/aio.h | 46 ++------------- 7 files changed, 81 insertions(+), 77 deletions(-) diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 175c9956cbe3..b64538b498dc 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -655,9 +655,10 @@ static void ffs_user_copy_worker(struct work_struct *work) unuse_mm(io_data->mm); } - aio_complete(io_data->kiocb, ret, ret); + io_data->kiocb->ki_complete(io_data->kiocb, ret, ret); - if (io_data->ffs->ffs_eventfd && !io_data->kiocb->ki_eventfd) + if (io_data->ffs->ffs_eventfd && + !(io_data->kiocb->ki_flags & IOCB_EVENTFD)) eventfd_signal(io_data->ffs->ffs_eventfd, 1); usb_ep_free_request(io_data->ep, io_data->req); diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 200f9a584064..a4a80694f607 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -469,7 +469,7 @@ static void ep_user_copy_worker(struct work_struct *work) ret = -EFAULT; /* completing the iocb can drop the ctx and mm, don't touch mm after */ - aio_complete(iocb, ret, ret); + iocb->ki_complete(iocb, ret, ret); kfree(priv->buf); kfree(priv->to_free); @@ -497,7 +497,8 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) kfree(priv); iocb->private = NULL; /* aio_complete() reports bytes-transferred _and_ faults */ - aio_complete(iocb, req->actual ? req->actual : req->status, + + iocb->ki_complete(iocb, req->actual ? req->actual : req->status, req->status); } else { /* ep_copy_to_user() won't report both; we hide some faults */ diff --git a/fs/aio.c b/fs/aio.c index 8ca8df1c3550..958286536a10 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -151,6 +151,38 @@ struct kioctx { unsigned id; }; +/* + * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either + * cancelled or completed (this makes a certain amount of sense because + * successful cancellation - io_cancel() - does deliver the completion to + * userspace). + * + * And since most things don't implement kiocb cancellation and we'd really like + * kiocb completion to be lockless when possible, we use ki_cancel to + * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED + * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). + */ +#define KIOCB_CANCELLED ((void *) (~0ULL)) + +struct aio_kiocb { + struct kiocb common; + + struct kioctx *ki_ctx; + kiocb_cancel_fn *ki_cancel; + + struct iocb __user *ki_user_iocb; /* user's aiocb */ + __u64 ki_user_data; /* user's data for completion */ + + struct list_head ki_list; /* the aio core uses this + * for cancellation */ + + /* + * If the aio_resfd field of the userspace iocb is not zero, + * this is the underlying eventfd context to deliver events to. + */ + struct eventfd_ctx *ki_eventfd; +}; + /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); unsigned long aio_nr; /* current system wide number of aio requests */ @@ -220,7 +252,7 @@ static int __init aio_setup(void) if (IS_ERR(aio_mnt)) panic("Failed to create aio fs mount."); - kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); @@ -480,8 +512,9 @@ static int aio_setup_ring(struct kioctx *ctx) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) +void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { + struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common); struct kioctx *ctx = req->ki_ctx; unsigned long flags; @@ -496,7 +529,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct kiocb *kiocb) +static int kiocb_cancel(struct aio_kiocb *kiocb) { kiocb_cancel_fn *old, *cancel; @@ -514,7 +547,7 @@ static int kiocb_cancel(struct kiocb *kiocb) cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); } while (cancel != old); - return cancel(kiocb); + return cancel(&kiocb->common); } static void free_ioctx(struct work_struct *work) @@ -550,13 +583,13 @@ static void free_ioctx_reqs(struct percpu_ref *ref) static void free_ioctx_users(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, users); - struct kiocb *req; + struct aio_kiocb *req; spin_lock_irq(&ctx->ctx_lock); while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, - struct kiocb, ki_list); + struct aio_kiocb, ki_list); list_del_init(&req->ki_list); kiocb_cancel(req); @@ -932,9 +965,9 @@ static void user_refill_reqs_available(struct kioctx *ctx) * Allocate a slot for an aio request. * Returns NULL if no requests are free. */ -static inline struct kiocb *aio_get_req(struct kioctx *ctx) +static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { - struct kiocb *req; + struct aio_kiocb *req; if (!get_reqs_available(ctx)) { user_refill_reqs_available(ctx); @@ -955,10 +988,10 @@ out_put: return NULL; } -static void kiocb_free(struct kiocb *req) +static void kiocb_free(struct aio_kiocb *req) { - if (req->ki_filp) - fput(req->ki_filp); + if (req->common.ki_filp) + fput(req->common.ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); kmem_cache_free(kiocb_cachep, req); @@ -994,8 +1027,9 @@ out: /* aio_complete * Called when the io request on the given iocb is complete. */ -void aio_complete(struct kiocb *iocb, long res, long res2) +static void aio_complete(struct kiocb *kiocb, long res, long res2) { + struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common); struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; @@ -1009,7 +1043,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) * ref, no other paths have a way to get another ref * - the sync task helpfully left a reference to itself in the iocb */ - BUG_ON(is_sync_kiocb(iocb)); + BUG_ON(is_sync_kiocb(kiocb)); if (iocb->ki_list.next) { unsigned long flags; @@ -1035,7 +1069,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - event->obj = (u64)(unsigned long)iocb->ki_obj.user; + event->obj = (u64)(unsigned long)iocb->ki_user_iocb; event->data = iocb->ki_user_data; event->res = res; event->res2 = res2; @@ -1044,7 +1078,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, + ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, res, res2); /* after flagging the request as done, we @@ -1091,7 +1125,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) percpu_ref_put(&ctx->reqs); } -EXPORT_SYMBOL(aio_complete); /* aio_read_events_ring * Pull an event off of the ioctx's event ring. Returns the number of @@ -1480,7 +1513,7 @@ rw_common: static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, struct iocb *iocb, bool compat) { - struct kiocb *req; + struct aio_kiocb *req; ssize_t ret; /* enforce forwards compatibility on users */ @@ -1503,11 +1536,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!req)) return -EAGAIN; - req->ki_filp = fget(iocb->aio_fildes); - if (unlikely(!req->ki_filp)) { + req->common.ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->common.ki_filp)) { ret = -EBADF; goto out_put_req; } + req->common.ki_pos = iocb->aio_offset; + req->common.ki_complete = aio_complete; + req->common.ki_flags = 0; if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* @@ -1522,6 +1558,8 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_eventfd = NULL; goto out_put_req; } + + req->common.ki_flags |= IOCB_EVENTFD; } ret = put_user(KIOCB_KEY, &user_iocb->aio_key); @@ -1530,11 +1568,10 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; } - req->ki_obj.user = user_iocb; + req->ki_user_iocb = user_iocb; req->ki_user_data = iocb->aio_data; - req->ki_pos = iocb->aio_offset; - ret = aio_run_iocb(req, iocb->aio_lio_opcode, + ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode, (char __user *)(unsigned long)iocb->aio_buf, iocb->aio_nbytes, compat); @@ -1623,10 +1660,10 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, /* lookup_kiocb * Finds a given iocb for cancellation. */ -static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, - u32 key) +static struct aio_kiocb * +lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key) { - struct list_head *pos; + struct aio_kiocb *kiocb; assert_spin_locked(&ctx->ctx_lock); @@ -1634,9 +1671,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, return NULL; /* TODO: use a hash or array, this sucks. */ - list_for_each(pos, &ctx->active_reqs) { - struct kiocb *kiocb = list_kiocb(pos); - if (kiocb->ki_obj.user == iocb) + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { + if (kiocb->ki_user_iocb == iocb) return kiocb; } return NULL; @@ -1656,7 +1692,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { struct kioctx *ctx; - struct kiocb *kiocb; + struct aio_kiocb *kiocb; u32 key; int ret; diff --git a/fs/direct-io.c b/fs/direct-io.c index e181b6b2e297..c38b460776e6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -265,7 +265,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, ret = err; } - aio_complete(dio->iocb, ret, 0); + dio->iocb->ki_complete(dio->iocb, ret, 0); } kmem_cache_free(dio_cache, dio); @@ -1056,7 +1056,7 @@ static inline int drop_refcount(struct dio *dio) * operation. AIO can if it was a broken operation described above or * in fact if all the bios race to complete before we get here. In * that case dio_complete() translates the EIOCBQUEUED into the proper - * return code that the caller will hand to aio_complete(). + * return code that the caller will hand to ->complete(). * * This is managed by the bio_lock instead of being an atomic_t so that * completion paths can drop their ref and use the remaining count to diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f81d83eb9758..a5c5e38b3ff8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -584,7 +584,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) spin_unlock(&fc->lock); } - aio_complete(io->iocb, res, 0); + io->iocb->ki_complete(io->iocb, res, 0); kfree(io); } } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 27cebf164070..5db3385fc108 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -393,7 +393,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) long res = (long) dreq->error; if (!res) res = (long) dreq->count; - aio_complete(dreq->iocb, res, 0); + dreq->iocb->ki_complete(dreq->iocb, res, 0); } complete_all(&dreq->completion); diff --git a/include/linux/aio.h b/include/linux/aio.h index f8516430490d..5c40b61285ac 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -14,67 +14,38 @@ struct kiocb; #define KIOCB_KEY 0 -/* - * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either - * cancelled or completed (this makes a certain amount of sense because - * successful cancellation - io_cancel() - does deliver the completion to - * userspace). - * - * And since most things don't implement kiocb cancellation and we'd really like - * kiocb completion to be lockless when possible, we use ki_cancel to - * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED - * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). - */ -#define KIOCB_CANCELLED ((void *) (~0ULL)) - typedef int (kiocb_cancel_fn)(struct kiocb *); +#define IOCB_EVENTFD (1 << 0) + struct kiocb { struct file *ki_filp; - struct kioctx *ki_ctx; /* NULL for sync ops */ - kiocb_cancel_fn *ki_cancel; - void *private; - - union { - void __user *user; - } ki_obj; - - __u64 ki_user_data; /* user's data for completion */ loff_t ki_pos; - - struct list_head ki_list; /* the aio core uses this - * for cancellation */ - - /* - * If the aio_resfd field of the userspace iocb is not zero, - * this is the underlying eventfd context to deliver events to. - */ - struct eventfd_ctx *ki_eventfd; + void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); + void *private; + int ki_flags; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) { - return kiocb->ki_ctx == NULL; + return kiocb->ki_complete == NULL; } static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { - .ki_ctx = NULL, .ki_filp = filp, }; } /* prototypes */ #ifdef CONFIG_AIO -extern void aio_complete(struct kiocb *iocb, long res, long res2); struct mm_struct; extern void exit_aio(struct mm_struct *mm); extern long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user *__user *iocbpp, bool compat); void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else -static inline void aio_complete(struct kiocb *iocb, long res, long res2) { } struct mm_struct; static inline void exit_aio(struct mm_struct *mm) { } static inline long do_io_submit(aio_context_t ctx_id, long nr, @@ -84,11 +55,6 @@ static inline void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ -static inline struct kiocb *list_kiocb(struct list_head *h) -{ - return list_entry(h, struct kiocb, ki_list); -} - /* for sysctl: */ extern unsigned long aio_nr; extern unsigned long aio_max_nr; From e2e40f2c1ed433c5e224525c8c862fd32e5d3df2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 Feb 2015 08:58:50 -0800 Subject: [PATCH 05/17] fs: move struct kiocb to fs.h struct kiocb now is a generic I/O container, so move it to fs.h. Also do a #include diet for aio.h while we're at it. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- arch/s390/hypfs/inode.c | 2 +- drivers/char/mem.c | 2 +- drivers/char/tile-srom.c | 1 - drivers/infiniband/hw/ipath/ipath_file_ops.c | 1 - drivers/infiniband/hw/qib/qib_file_ops.c | 1 - drivers/misc/mei/amthif.c | 1 - drivers/misc/mei/main.c | 1 - drivers/misc/mei/pci-me.c | 1 - drivers/scsi/sg.c | 2 +- drivers/staging/unisys/include/timskmod.h | 1 - drivers/usb/gadget/function/f_fs.c | 1 + drivers/usb/gadget/legacy/inode.c | 1 + fs/9p/vfs_addr.c | 2 +- fs/affs/file.c | 2 +- fs/afs/write.c | 1 - fs/bfs/inode.c | 1 + fs/block_dev.c | 1 - fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/file.c | 1 - fs/direct-io.c | 1 - fs/ecryptfs/file.c | 1 - fs/ext2/inode.c | 2 +- fs/ext3/inode.c | 2 +- fs/ext4/file.c | 2 +- fs/ext4/indirect.c | 2 +- fs/ext4/inode.c | 1 - fs/ext4/page-io.c | 1 - fs/f2fs/data.c | 2 +- fs/fat/inode.c | 1 - fs/fuse/cuse.c | 2 +- fs/fuse/dev.c | 1 - fs/fuse/file.c | 2 +- fs/gfs2/aops.c | 2 +- fs/gfs2/file.c | 1 - fs/hfs/inode.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/jfs/inode.c | 2 +- fs/nfs/file.c | 1 - fs/nilfs2/inode.c | 2 +- fs/ntfs/file.c | 1 - fs/ntfs/inode.c | 1 - fs/ocfs2/aops.c | 1 + fs/ocfs2/aops.h | 2 +- fs/pipe.c | 1 - fs/read_write.c | 1 - fs/reiserfs/inode.c | 2 +- fs/splice.c | 1 - fs/ubifs/file.c | 1 - fs/udf/file.c | 2 +- fs/udf/inode.c | 2 +- fs/xfs/xfs_aops.c | 1 - fs/xfs/xfs_file.c | 1 - include/linux/aio.h | 31 +------------------- include/linux/fs.h | 22 ++++++++++++++ include/net/sock.h | 1 - kernel/printk/printk.c | 2 +- kernel/sysctl.c | 1 + mm/filemap.c | 1 - mm/page_io.c | 2 +- mm/shmem.c | 2 +- net/ipv4/raw.c | 1 - sound/core/pcm_native.c | 2 +- 63 files changed, 55 insertions(+), 86 deletions(-) diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 4c8008dd938e..ad66b07f742e 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include "hypfs.h" diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 297110c12635..9c4fd7a8e2e5 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include diff --git a/drivers/char/tile-srom.c b/drivers/char/tile-srom.c index 02e76ac6d282..69f6b4acc377 100644 --- a/drivers/char/tile-srom.c +++ b/drivers/char/tile-srom.c @@ -27,7 +27,6 @@ #include /* size_t */ #include #include /* O_ACCMODE */ -#include #include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 6d7f453b4d05..aed8afee56da 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index b15e34eeef68..826c17ee4c7d 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/mei/amthif.c b/drivers/misc/mei/amthif.c index c4cb9a984a5f..40ea639fa413 100644 --- a/drivers/misc/mei/amthif.c +++ b/drivers/misc/mei/amthif.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 3c019c0e60eb..47680c84801c 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index bd3039ab8f98..af44ee26075d 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0cbc1fb45f10..c78a6f7bbc20 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -33,7 +33,6 @@ static int sg_version_num = 30536; /* 2 digits for each component */ #include #include #include -#include #include #include #include @@ -51,6 +50,7 @@ static int sg_version_num = 30536; /* 2 digits for each component */ #include #include #include +#include #include "scsi.h" #include diff --git a/drivers/staging/unisys/include/timskmod.h b/drivers/staging/unisys/include/timskmod.h index 4019a0d63645..52648d4d9922 100644 --- a/drivers/staging/unisys/include/timskmod.h +++ b/drivers/staging/unisys/include/timskmod.h @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index b64538b498dc..a12315a78248 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index a4a80694f607..662ef2c1c62b 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index eb14e055ea83..ff1a5bac4200 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include diff --git a/fs/affs/file.c b/fs/affs/file.c index d2468bf95669..33eaa67bb026 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -12,7 +12,7 @@ * affs regular file handling primitives */ -#include +#include #include "affs.h" static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); diff --git a/fs/afs/write.c b/fs/afs/write.c index c13cb08964ed..0714abcd7f32 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -14,7 +14,6 @@ #include #include #include -#include #include "internal.h" static int afs_write_back_from_locked_page(struct afs_writeback *wb, diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 90bc079d9982..fdcb4d69f430 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "bfs.h" diff --git a/fs/block_dev.c b/fs/block_dev.c index 975266be67d3..2e522aed6584 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include "internal.h" diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b78bbbac900d..69c9508d2c7e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -32,6 +31,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 54bcf639d1cf..b214ab178f3a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -43,6 +42,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 081c4e3f9e49..98e257c1b5b1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "super.h" diff --git a/fs/direct-io.c b/fs/direct-io.c index c38b460776e6..6fb00e3f1059 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,7 +37,6 @@ #include #include #include -#include /* * How many user pages to map in one call to get_user_pages(). This determines diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index a36da8841e0c..273d36e3f0c0 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -31,7 +31,6 @@ #include #include #include -#include #include "ecryptfs_kernel.h" /** diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6434bc000125..df9d6afbc5d5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include "ext2.h" #include "acl.h" #include "xattr.h" diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2c6ccc49ba27..db07ffbe7c85 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include "ext3.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 33a09da16c9c..598abbbe6786 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,9 +23,9 @@ #include #include #include -#include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 6b9878a24182..8611640856d3 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,9 +20,9 @@ * (sct@redhat.com), 1993, 1998 */ -#include #include "ext4_jbd2.h" #include "truncate.h" +#include #include diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 85404f15e53a..6325d2c1a65c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include "ext4_jbd2.h" diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index b24a2541a9ba..464984261e69 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 985ed023a750..497f8515d205 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,12 +12,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 497c7c5263c7..8521207de229 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 28d0c7abba1c..b3fa05032234 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ #include #include #include +#include #include "fuse_i.h" diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ed19a7d622fa..8c92c727ddd6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,7 +19,6 @@ #include #include #include -#include MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a5c5e38b3ff8..ff102cbf16ea 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -15,8 +15,8 @@ #include #include #include -#include #include +#include static const struct file_operations fuse_direct_io_file_operations; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 4ad4f94edebe..fe6634d25d1d 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include "gfs2.h" diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 3e32bb8e2d7e..f6fc412b1100 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index d0929bc81782..98d4ea45bb70 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "hfs_fs.h" #include "btree.h" diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 0cf786f2d046..f541196d4ee9 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "hfsplus_fs.h" #include "hfsplus_raw.h" diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index bd3df1ca3c9b..3197aed10614 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -22,8 +22,8 @@ #include #include #include +#include #include -#include #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 94712fc781fa..5d8b89cb13c0 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 8b5969538f39..ab4987bc637f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1da9b2d184dc..f16f2d8401fe 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 898b9949d363..1d0c21df0d80 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -28,7 +28,6 @@ #include #include #include -#include #include "aops.h" #include "attrib.h" diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 44db1808cdb5..e1bf18c5d25e 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -29,6 +29,7 @@ #include #include #include +#include #include diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 6cae155d54df..dd59599b022d 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,7 +22,7 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H -#include +#include handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, diff --git a/fs/pipe.c b/fs/pipe.c index 21981e58e2a6..2d084f2d0b83 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/fs/read_write.c b/fs/read_write.c index 76e324e8ce8d..99a6ef946d01 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index e72401e1f995..9312b7842e03 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/splice.c b/fs/splice.c index 7968da96bebb..4bbfa95b5bfe 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -32,7 +32,6 @@ #include #include #include -#include #include "internal.h" /* diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index e627c0acf626..c3d15fe83403 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -50,7 +50,6 @@ */ #include "ubifs.h" -#include #include #include #include diff --git a/fs/udf/file.c b/fs/udf/file.c index 9c0b6da9dbb3..7f885cc8b0b7 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a445d599098d..9c1fbd23913d 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3a9b7a1b8704..4f8cdc59bc38 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,7 +31,6 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" -#include #include #include #include diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1cdba95c78cb..f527618cb42b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -37,7 +37,6 @@ #include "xfs_log.h" #include "xfs_icache.h" -#include #include #include #include diff --git a/include/linux/aio.h b/include/linux/aio.h index 5c40b61285ac..9eb42dbc5582 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -1,52 +1,23 @@ #ifndef __LINUX__AIO_H #define __LINUX__AIO_H -#include -#include #include -#include -#include - -#include struct kioctx; struct kiocb; +struct mm_struct; #define KIOCB_KEY 0 typedef int (kiocb_cancel_fn)(struct kiocb *); -#define IOCB_EVENTFD (1 << 0) - -struct kiocb { - struct file *ki_filp; - loff_t ki_pos; - void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); - void *private; - int ki_flags; -}; - -static inline bool is_sync_kiocb(struct kiocb *kiocb) -{ - return kiocb->ki_complete == NULL; -} - -static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) -{ - *kiocb = (struct kiocb) { - .ki_filp = filp, - }; -} - /* prototypes */ #ifdef CONFIG_AIO -struct mm_struct; extern void exit_aio(struct mm_struct *mm); extern long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user *__user *iocbpp, bool compat); void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else -struct mm_struct; static inline void exit_aio(struct mm_struct *mm) { } static inline long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user * __user *iocbpp, diff --git a/include/linux/fs.h b/include/linux/fs.h index 447932aed1e1..48c1472bde4a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -314,6 +314,28 @@ struct page; struct address_space; struct writeback_control; +#define IOCB_EVENTFD (1 << 0) + +struct kiocb { + struct file *ki_filp; + loff_t ki_pos; + void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); + void *private; + int ki_flags; +}; + +static inline bool is_sync_kiocb(struct kiocb *kiocb) +{ + return kiocb->ki_complete == NULL; +} + +static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) +{ + *kiocb = (struct kiocb) { + .ki_filp = filp, + }; +} + /* * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet diff --git a/include/net/sock.h b/include/net/sock.h index ab186b1d31ff..71c1300025e2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -57,7 +57,6 @@ #include #include #include -#include #include #include diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60b2aa2a2da9..40d50cc4c686 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -46,6 +45,7 @@ #include #include #include +#include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 88ea2d6e0031..83d907afb4a6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -19,6 +19,7 @@ */ #include +#include #include #include #include diff --git a/mm/filemap.c b/mm/filemap.c index ad7242043bdb..876f4e6f3ed6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/page_io.c b/mm/page_io.c index 7ef21577856c..a96c8562d835 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include #include static struct bio *get_swap_bio(gfp_t gfp_flags, diff --git a/mm/shmem.c b/mm/shmem.c index a63031fa3e0c..944b94079bb0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include static struct vfsmount *shm_mnt; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index f027a708b7e0..4a356b7c081b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index b03a638b420c..9ecff240a39b 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include /* * Compatibility From bc917be8105993c256338ad1189650364a741483 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 21 Mar 2015 17:45:43 -0400 Subject: [PATCH 06/17] saner iov_iter initialization primitives iovec-backed iov_iter instances are assumed to satisfy several properties: * no more than UIO_MAXIOV elements in iovec array * total size of all ranges is no more than MAX_RW_COUNT * all ranges pass access_ok(). The problem is, invariants of data structures should be established in the primitives creating those data structures, not in the code using those primitives. And iov_iter_init() violates that principle. For a while we managed to get away with that, but once the use of iov_iter started to spread, it didn't take long for shit to hit the fan - missed check in sys_sendto() had introduced a roothole. We _do_ have primitives for importing and validating iovecs (both native and compat ones) and those primitives are almost always followed by shoving the resulting iovec into iov_iter. Life would be considerably simpler (and safer) if we combined those primitives with initializing iov_iter. That gives us two new primitives - import_iovec() and compat_import_iovec(). Calling conventions: iovec = iov_array; err = import_iovec(direction, uvec, nr_segs, ARRAY_SIZE(iov_array), &iovec, &iter); imports user vector into kernel space (into iov_array if it fits, allocated if it doesn't fit or if iovec was NULL), validates it and sets iter up to refer to it. On success 0 is returned and allocated kernel copy (or NULL if the array had fit into caller-supplied one) is returned via iovec. On failure all allocations are undone and -E... is returned. If the total size of ranges exceeds MAX_RW_COUNT, the excess is silently truncated. compat_import_iovec() expects uvec to be a pointer to user array of compat_iovec; otherwise it's identical to import_iovec(). Finally, import_single_range() sets iov_iter backed by single-element iovec covering a user-supplied range - err = import_single_range(direction, address, size, iovec, &iter); does validation and sets iter up. Again, size in excess of MAX_RW_COUNT gets silently truncated. Next commits will be switching the things up to use of those and reducing the amount of iov_iter_init() instances. Signed-off-by: Al Viro --- include/linux/uio.h | 14 +++++++++++ lib/iov_iter.c | 57 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/include/linux/uio.h b/include/linux/uio.h index 71880299ed48..1f4a37f1f025 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -139,4 +139,18 @@ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); +int import_iovec(int type, const struct iovec __user * uvector, + unsigned nr_segs, unsigned fast_segs, + struct iovec **iov, struct iov_iter *i); + +#ifdef CONFIG_COMPAT +struct compat_iovec; +int compat_import_iovec(int type, const struct compat_iovec __user * uvector, + unsigned nr_segs, unsigned fast_segs, + struct iovec **iov, struct iov_iter *i); +#endif + +int import_single_range(int type, void __user *buf, size_t len, + struct iovec *iov, struct iov_iter *i); + #endif diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 9d96e283520c..fc6e33f6b7f3 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -766,3 +766,60 @@ const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) flags); } EXPORT_SYMBOL(dup_iter); + +int import_iovec(int type, const struct iovec __user * uvector, + unsigned nr_segs, unsigned fast_segs, + struct iovec **iov, struct iov_iter *i) +{ + ssize_t n; + struct iovec *p; + n = rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, + *iov, &p); + if (n < 0) { + if (p != *iov) + kfree(p); + *iov = NULL; + return n; + } + iov_iter_init(i, type, p, nr_segs, n); + *iov = p == *iov ? NULL : p; + return 0; +} +EXPORT_SYMBOL(import_iovec); + +#ifdef CONFIG_COMPAT +#include + +int compat_import_iovec(int type, const struct compat_iovec __user * uvector, + unsigned nr_segs, unsigned fast_segs, + struct iovec **iov, struct iov_iter *i) +{ + ssize_t n; + struct iovec *p; + n = compat_rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, + *iov, &p); + if (n < 0) { + if (p != *iov) + kfree(p); + *iov = NULL; + return n; + } + iov_iter_init(i, type, p, nr_segs, n); + *iov = p == *iov ? NULL : p; + return 0; +} +#endif + +int import_single_range(int rw, void __user *buf, size_t len, + struct iovec *iov, struct iov_iter *i) +{ + if (len > MAX_RW_COUNT) + len = MAX_RW_COUNT; + if (unlikely(!access_ok(!rw, buf, len))) + return -EFAULT; + + iov->iov_base = buf; + iov->iov_len = len; + iov_iter_init(i, rw, iov, 1, len); + return 0; +} From 3af6878ecad229346fbc2c8f2663089aa6aef20d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 1 Apr 2015 14:06:00 +0100 Subject: [PATCH 07/17] RxRPC: Fix the conversion to iov_iter This commit: commit af2b040e470b470bfc881981db3c796072853eae Author: Al Viro Date: Thu Nov 27 21:44:24 2014 -0500 Subject: rxrpc: switch rxrpc_send_data() to iov_iter primitives incorrectly changes a do-while loop into a while loop in rxrpc_send_data(). Unfortunately, at least one pass through the loop is required - even if there is no data - so that the packet the closes the send phase can be sent if MSG_MORE is not set. Signed-off-by: David Howells --- net/rxrpc/ar-output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 8331c95e1522..833a33b5c180 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -548,7 +548,7 @@ static int rxrpc_send_data(struct kiocb *iocb, copied = 0; if (len > iov_iter_count(&msg->msg_iter)) len = iov_iter_count(&msg->msg_iter); - while (len) { + do { int copy; if (!skb) { @@ -689,7 +689,7 @@ static int rxrpc_send_data(struct kiocb *iocb, rxrpc_queue_packet(call, skb, !iov_iter_count(&msg->msg_iter) && !more); skb = NULL; } - } + } while (len > 0); success: ret = copied; From aab94830a7fdf17aac07fea54d4cb43b0ad001b8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 1 Apr 2015 15:48:00 +0100 Subject: [PATCH 08/17] RxRPC: Don't call skb_add_data() if there's no data to copy Don't call skb_add_data() in rxrpc_send_data() if there's no data to copy and also skip the calculations associated with it in such a case. Signed-off-by: David Howells --- net/rxrpc/ar-output.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 833a33b5c180..f48dc1aa4840 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -549,8 +549,6 @@ static int rxrpc_send_data(struct kiocb *iocb, if (len > iov_iter_count(&msg->msg_iter)) len = iov_iter_count(&msg->msg_iter); do { - int copy; - if (!skb) { size_t size, chunk, max, space; @@ -616,23 +614,25 @@ static int rxrpc_send_data(struct kiocb *iocb, sp = rxrpc_skb(skb); /* append next segment of data to the current buffer */ - copy = skb_tailroom(skb); - ASSERTCMP(copy, >, 0); - if (copy > len) - copy = len; - if (copy > sp->remain) - copy = sp->remain; + if (len > 0) { + int copy = skb_tailroom(skb); + ASSERTCMP(copy, >, 0); + if (copy > len) + copy = len; + if (copy > sp->remain) + copy = sp->remain; - _debug("add"); - ret = skb_add_data(skb, &msg->msg_iter, copy); - _debug("added"); - if (ret < 0) - goto efault; - sp->remain -= copy; - skb->mark += copy; - copied += copy; + _debug("add"); + ret = skb_add_data(skb, &msg->msg_iter, copy); + _debug("added"); + if (ret < 0) + goto efault; + sp->remain -= copy; + skb->mark += copy; + copied += copy; - len -= copy; + len -= copy; + } /* check for the far side aborting the call or a network error * occurring */ From 382d7974de31ef5e64dceee0d9cada3d3864b767 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 1 Apr 2015 15:43:26 +0100 Subject: [PATCH 09/17] RxRPC: Use iov_iter_count() in rxrpc_send_data() instead of the len argument Use iov_iter_count() in rxrpc_send_data() to get the remaining data length instead of using the len argument as the len argument is now redundant. Signed-off-by: David Howells --- net/rxrpc/ar-output.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index f48dc1aa4840..de8d2f1b08c5 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -546,8 +546,6 @@ static int rxrpc_send_data(struct kiocb *iocb, call->tx_pending = NULL; copied = 0; - if (len > iov_iter_count(&msg->msg_iter)) - len = iov_iter_count(&msg->msg_iter); do { if (!skb) { size_t size, chunk, max, space; @@ -570,8 +568,8 @@ static int rxrpc_send_data(struct kiocb *iocb, max &= ~(call->conn->size_align - 1UL); chunk = max; - if (chunk > len && !more) - chunk = len; + if (chunk > iov_iter_count(&msg->msg_iter) && !more) + chunk = iov_iter_count(&msg->msg_iter); space = chunk + call->conn->size_align; space &= ~(call->conn->size_align - 1UL); @@ -614,11 +612,11 @@ static int rxrpc_send_data(struct kiocb *iocb, sp = rxrpc_skb(skb); /* append next segment of data to the current buffer */ - if (len > 0) { + if (iov_iter_count(&msg->msg_iter) > 0) { int copy = skb_tailroom(skb); ASSERTCMP(copy, >, 0); - if (copy > len) - copy = len; + if (copy > iov_iter_count(&msg->msg_iter)) + copy = iov_iter_count(&msg->msg_iter); if (copy > sp->remain) copy = sp->remain; @@ -630,8 +628,6 @@ static int rxrpc_send_data(struct kiocb *iocb, sp->remain -= copy; skb->mark += copy; copied += copy; - - len -= copy; } /* check for the far side aborting the call or a network error @@ -640,7 +636,8 @@ static int rxrpc_send_data(struct kiocb *iocb, goto call_aborted; /* add the packet to the send queue if it's now full */ - if (sp->remain <= 0 || (!len && !more)) { + if (sp->remain <= 0 || + (iov_iter_count(&msg->msg_iter) == 0 && !more)) { struct rxrpc_connection *conn = call->conn; uint32_t seq; size_t pad; @@ -670,7 +667,7 @@ static int rxrpc_send_data(struct kiocb *iocb, sp->hdr.serviceId = conn->service_id; sp->hdr.flags = conn->out_clientflag; - if (len == 0 && !more) + if (iov_iter_count(&msg->msg_iter) == 0 && !more) sp->hdr.flags |= RXRPC_LAST_PACKET; else if (CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz) > 1) @@ -686,10 +683,11 @@ static int rxrpc_send_data(struct kiocb *iocb, memcpy(skb->head, &sp->hdr, sizeof(struct rxrpc_header)); - rxrpc_queue_packet(call, skb, !iov_iter_count(&msg->msg_iter) && !more); + rxrpc_queue_packet(call, skb, + iov_iter_count(&msg->msg_iter) == 0 && !more); skb = NULL; } - } while (len > 0); + } while (iov_iter_count(&msg->msg_iter) > 0); success: ret = copied; From bfd4e9562c8769feadba9b5262200448656b8048 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 1 Apr 2015 16:03:46 +0100 Subject: [PATCH 10/17] AFS: afs_send_empty_reply() doesn't require an iovec array afs_send_empty_reply() doesn't require an iovec array with which to initialise the msghdr, but can pass NULL instead. Suggested-by: Al Viro Signed-off-by: David Howells --- fs/afs/rxrpc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index dbc732e9a5c0..3a57a1b0fb51 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -770,15 +770,12 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, void afs_send_empty_reply(struct afs_call *call) { struct msghdr msg; - struct kvec iov[1]; _enter(""); - iov[0].iov_base = NULL; - iov[0].iov_len = 0; msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 0, 0); /* WTF? */ + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; From 44ba06987c0b10faa998b9324850e8a6564c714d Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 1 Apr 2015 16:31:26 +0100 Subject: [PATCH 11/17] RxRPC: Handle VERSION Rx protocol packets Handle VERSION Rx protocol packets. We should respond to a VERSION packet with a string indicating the Rx version. This is a maximum of 64 characters and is padded out to 65 chars with NUL bytes. Note that other AFS clients use the version request as a NAT keepalive so we need to handle it rather than returning an abort. The standard formulation seems to be: built --
for example: " OpenAFS 1.6.2 built 2013-05-07 " (note the three extra spaces) as obtained with: rxdebug grand.mit.edu -version from the openafs package. Signed-off-by: David Howells --- include/rxrpc/packet.h | 3 +- net/rxrpc/ar-input.c | 23 +++++++++- net/rxrpc/ar-internal.h | 2 + net/rxrpc/ar-local.c | 98 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 2 deletions(-) diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h index f2902ef7ab75..4dce116bfd80 100644 --- a/include/rxrpc/packet.h +++ b/include/rxrpc/packet.h @@ -47,7 +47,8 @@ struct rxrpc_header { #define RXRPC_PACKET_TYPE_CHALLENGE 6 /* connection security challenge (SRVR->CLNT) */ #define RXRPC_PACKET_TYPE_RESPONSE 7 /* connection secutity response (CLNT->SRVR) */ #define RXRPC_PACKET_TYPE_DEBUG 8 /* debug info request */ -#define RXRPC_N_PACKET_TYPES 9 /* number of packet types (incl type 0) */ +#define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ +#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ uint8_t flags; /* packet flags */ #define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */ diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c index 481f89f93789..4505a691d88c 100644 --- a/net/rxrpc/ar-input.c +++ b/net/rxrpc/ar-input.c @@ -28,7 +28,7 @@ const char *rxrpc_pkts[] = { "?00", "DATA", "ACK", "BUSY", "ABORT", "ACKALL", "CHALL", "RESP", "DEBUG", - "?09", "?10", "?11", "?12", "?13", "?14", "?15" + "?09", "?10", "?11", "?12", "VERSION", "?14", "?15" }; /* @@ -593,6 +593,20 @@ static void rxrpc_post_packet_to_conn(struct rxrpc_connection *conn, rxrpc_queue_conn(conn); } +/* + * post endpoint-level events to the local endpoint + * - this includes debug and version messages + */ +static void rxrpc_post_packet_to_local(struct rxrpc_local *local, + struct sk_buff *skb) +{ + _enter("%p,%p", local, skb); + + atomic_inc(&local->usage); + skb_queue_tail(&local->event_queue, skb); + rxrpc_queue_work(&local->event_processor); +} + static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local, struct sk_buff *skb, struct rxrpc_skb_priv *sp) @@ -699,6 +713,11 @@ void rxrpc_data_ready(struct sock *sk) goto bad_message; } + if (sp->hdr.type == RXRPC_PACKET_TYPE_VERSION) { + rxrpc_post_packet_to_local(local, skb); + goto out; + } + if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA && (sp->hdr.callNumber == 0 || sp->hdr.seq == 0)) goto bad_message; @@ -731,6 +750,8 @@ void rxrpc_data_ready(struct sock *sk) else goto cant_route_call; } + +out: rxrpc_put_local(local); return; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index ba9fd36d3f15..9a4f7a26adc6 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -152,11 +152,13 @@ struct rxrpc_local { struct work_struct destroyer; /* endpoint destroyer */ struct work_struct acceptor; /* incoming call processor */ struct work_struct rejecter; /* packet reject writer */ + struct work_struct event_processor; /* endpoint event processor */ struct list_head services; /* services listening on this endpoint */ struct list_head link; /* link in endpoint list */ struct rw_semaphore defrag_sem; /* control re-enablement of IP DF bit */ struct sk_buff_head accept_queue; /* incoming calls awaiting acceptance */ struct sk_buff_head reject_queue; /* packets awaiting rejection */ + struct sk_buff_head event_queue; /* endpoint event packets awaiting processing */ spinlock_t lock; /* access lock */ rwlock_t services_lock; /* lock for services list */ atomic_t usage; diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c index 87f7135d238b..ca904ed5400a 100644 --- a/net/rxrpc/ar-local.c +++ b/net/rxrpc/ar-local.c @@ -13,16 +13,22 @@ #include #include #include +#include +#include #include #include +#include #include "ar-internal.h" +static const char rxrpc_version_string[65] = "linux-" UTS_RELEASE " AF_RXRPC"; + static LIST_HEAD(rxrpc_locals); DEFINE_RWLOCK(rxrpc_local_lock); static DECLARE_RWSEM(rxrpc_local_sem); static DECLARE_WAIT_QUEUE_HEAD(rxrpc_local_wq); static void rxrpc_destroy_local(struct work_struct *work); +static void rxrpc_process_local_events(struct work_struct *work); /* * allocate a new local @@ -37,11 +43,13 @@ struct rxrpc_local *rxrpc_alloc_local(struct sockaddr_rxrpc *srx) INIT_WORK(&local->destroyer, &rxrpc_destroy_local); INIT_WORK(&local->acceptor, &rxrpc_accept_incoming_calls); INIT_WORK(&local->rejecter, &rxrpc_reject_packets); + INIT_WORK(&local->event_processor, &rxrpc_process_local_events); INIT_LIST_HEAD(&local->services); INIT_LIST_HEAD(&local->link); init_rwsem(&local->defrag_sem); skb_queue_head_init(&local->accept_queue); skb_queue_head_init(&local->reject_queue); + skb_queue_head_init(&local->event_queue); spin_lock_init(&local->lock); rwlock_init(&local->services_lock); atomic_set(&local->usage, 1); @@ -264,10 +272,12 @@ static void rxrpc_destroy_local(struct work_struct *work) ASSERT(list_empty(&local->services)); ASSERT(!work_pending(&local->acceptor)); ASSERT(!work_pending(&local->rejecter)); + ASSERT(!work_pending(&local->event_processor)); /* finish cleaning up the local descriptor */ rxrpc_purge_queue(&local->accept_queue); rxrpc_purge_queue(&local->reject_queue); + rxrpc_purge_queue(&local->event_queue); kernel_sock_shutdown(local->socket, SHUT_RDWR); sock_release(local->socket); @@ -308,3 +318,91 @@ void __exit rxrpc_destroy_all_locals(void) _leave(""); } + +/* + * Reply to a version request + */ +static void rxrpc_send_version_request(struct rxrpc_local *local, + struct rxrpc_header *hdr, + struct sk_buff *skb) +{ + struct sockaddr_in sin; + struct msghdr msg; + struct kvec iov[2]; + size_t len; + int ret; + + _enter(""); + + sin.sin_family = AF_INET; + sin.sin_port = udp_hdr(skb)->source; + sin.sin_addr.s_addr = ip_hdr(skb)->saddr; + + msg.msg_name = &sin; + msg.msg_namelen = sizeof(sin); + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = 0; + + hdr->seq = 0; + hdr->serial = 0; + hdr->type = RXRPC_PACKET_TYPE_VERSION; + hdr->flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED); + hdr->userStatus = 0; + hdr->_rsvd = 0; + + iov[0].iov_base = hdr; + iov[0].iov_len = sizeof(*hdr); + iov[1].iov_base = (char *)rxrpc_version_string; + iov[1].iov_len = sizeof(rxrpc_version_string); + + len = iov[0].iov_len + iov[1].iov_len; + + _proto("Tx VERSION (reply)"); + + ret = kernel_sendmsg(local->socket, &msg, iov, 2, len); + if (ret < 0) + _debug("sendmsg failed: %d", ret); + + _leave(""); +} + +/* + * Process event packets targetted at a local endpoint. + */ +static void rxrpc_process_local_events(struct work_struct *work) +{ + struct rxrpc_local *local = container_of(work, struct rxrpc_local, event_processor); + struct sk_buff *skb; + char v; + + _enter(""); + + atomic_inc(&local->usage); + + while ((skb = skb_dequeue(&local->event_queue))) { + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + kdebug("{%d},{%u}", local->debug_id, sp->hdr.type); + + switch (sp->hdr.type) { + case RXRPC_PACKET_TYPE_VERSION: + if (skb_copy_bits(skb, 0, &v, 1) < 0) + return; + _proto("Rx VERSION { %02x }", v); + if (v == 0) + rxrpc_send_version_request(local, &sp->hdr, skb); + break; + + default: + /* Just ignore anything we don't understand */ + break; + } + + rxrpc_put_local(local); + rxrpc_free_skb(skb); + } + + rxrpc_put_local(local); + _leave(""); +} From 53d5864bc673288f03bb7892a3cffa41a173082b Mon Sep 17 00:00:00 2001 From: Nathaniel Wesley Filardo Date: Thu, 21 Aug 2014 14:10:55 -0400 Subject: [PATCH 12/17] kafs: Add more "unified AFS" error codes This should cover the set emitted by viced and the volume server. Signed-off-by: Nathaniel Wesley Filardo Signed-off-by: David Howells --- fs/afs/misc.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 0dd4dafee10b..91ea1aa0d8b3 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -22,9 +22,12 @@ int afs_abort_to_error(u32 abort_code) { switch (abort_code) { + /* low errno codes inserted into abort namespace */ case 13: return -EACCES; case 27: return -EFBIG; case 30: return -EROFS; + + /* VICE "special error" codes; 101 - 111 */ case VSALVAGE: return -EIO; case VNOVNODE: return -ENOENT; case VNOVOL: return -ENOMEDIUM; @@ -36,11 +39,18 @@ int afs_abort_to_error(u32 abort_code) case VOVERQUOTA: return -EDQUOT; case VBUSY: return -EBUSY; case VMOVED: return -ENXIO; - case 0x2f6df0a: return -EWOULDBLOCK; + + /* Unified AFS error table; ET "uae" == 0x2f6df00 */ + case 0x2f6df00: return -EPERM; + case 0x2f6df01: return -ENOENT; + case 0x2f6df04: return -EIO; + case 0x2f6df0a: return -EAGAIN; + case 0x2f6df0b: return -ENOMEM; case 0x2f6df0c: return -EACCES; case 0x2f6df0f: return -EBUSY; case 0x2f6df10: return -EEXIST; case 0x2f6df11: return -EXDEV; + case 0x2f6df12: return -ENODEV; case 0x2f6df13: return -ENOTDIR; case 0x2f6df14: return -EISDIR; case 0x2f6df15: return -EINVAL; @@ -54,8 +64,12 @@ int afs_abort_to_error(u32 abort_code) case 0x2f6df23: return -ENAMETOOLONG; case 0x2f6df24: return -ENOLCK; case 0x2f6df26: return -ENOTEMPTY; + case 0x2f6df28: return -EWOULDBLOCK; + case 0x2f6df69: return -ENOTCONN; + case 0x2f6df6c: return -ETIMEDOUT; case 0x2f6df78: return -EDQUOT; + /* RXKAD abort codes; from include/rxrpc/packet.h. ET "RXK" == 0x1260B00 */ case RXKADINCONSISTENCY: return -EPROTO; case RXKADPACKETSHORT: return -EPROTO; case RXKADLEVELFAIL: return -EKEYREJECTED; From 602bd0e90e14c0b50246b361290dbbbe551ada98 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 21 Mar 2015 19:12:32 -0400 Subject: [PATCH 13/17] net: switch sendto() and recvfrom() to import_single_range() Signed-off-by: Al Viro --- net/socket.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/net/socket.c b/net/socket.c index 989b1ae32afa..46f0e1d752b3 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1650,18 +1650,14 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, struct iovec iov; int fput_needed; - if (len > INT_MAX) - len = INT_MAX; - if (unlikely(!access_ok(VERIFY_READ, buff, len))) - return -EFAULT; + err = import_single_range(WRITE, buff, len, &iov, &msg.msg_iter); + if (unlikely(err)) + return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; - iov.iov_base = buff; - iov.iov_len = len; msg.msg_name = NULL; - iov_iter_init(&msg.msg_iter, WRITE, &iov, 1, len); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; @@ -1675,7 +1671,7 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; - err = sock_sendmsg(sock, &msg, len); + err = sock_sendmsg(sock, &msg, iov_iter_count(&msg.msg_iter)); out_put: fput_light(sock->file, fput_needed); @@ -1710,26 +1706,22 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, int err, err2; int fput_needed; - if (size > INT_MAX) - size = INT_MAX; - if (unlikely(!access_ok(VERIFY_WRITE, ubuf, size))) - return -EFAULT; + err = import_single_range(READ, ubuf, size, &iov, &msg.msg_iter); + if (unlikely(err)) + return err; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; msg.msg_control = NULL; msg.msg_controllen = 0; - iov.iov_len = size; - iov.iov_base = ubuf; - iov_iter_init(&msg.msg_iter, READ, &iov, 1, size); /* Save some cycles and don't copy the address if not needed */ msg.msg_name = addr ? (struct sockaddr *)&address : NULL; /* We assume all kernel code knows the size of sockaddr_storage */ msg.msg_namelen = 0; if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; - err = sock_recvmsg(sock, &msg, size, flags); + err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags); if (err >= 0 && addr != NULL) { err2 = move_addr_to_user(&address, From da18428498fb24438a23d982259461fe22bc1f46 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 21 Mar 2015 19:29:06 -0400 Subject: [PATCH 14/17] net: switch importing msghdr from userland to {compat_,}import_iovec() Signed-off-by: Al Viro --- include/net/compat.h | 2 +- net/compat.c | 18 +++++++----------- net/socket.c | 31 ++++++++++++------------------- 3 files changed, 20 insertions(+), 31 deletions(-) diff --git a/include/net/compat.h b/include/net/compat.h index 42a9c8431177..48103cf94e97 100644 --- a/include/net/compat.h +++ b/include/net/compat.h @@ -40,7 +40,7 @@ int compat_sock_get_timestampns(struct sock *, struct timespec __user *); #define compat_mmsghdr mmsghdr #endif /* defined(CONFIG_COMPAT) */ -ssize_t get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, +int get_compat_msghdr(struct msghdr *, struct compat_msghdr __user *, struct sockaddr __user **, struct iovec **); asmlinkage long compat_sys_sendmsg(int, struct compat_msghdr __user *, unsigned int); diff --git a/net/compat.c b/net/compat.c index c4b6b0f43d5d..5cfd26a0006f 100644 --- a/net/compat.c +++ b/net/compat.c @@ -31,10 +31,10 @@ #include #include -ssize_t get_compat_msghdr(struct msghdr *kmsg, - struct compat_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec **iov) +int get_compat_msghdr(struct msghdr *kmsg, + struct compat_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec **iov) { compat_uptr_t uaddr, uiov, tmp3; compat_size_t nr_segs; @@ -81,13 +81,9 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_iocb = NULL; - err = compat_rw_copy_check_uvector(save_addr ? READ : WRITE, - compat_ptr(uiov), nr_segs, - UIO_FASTIOV, *iov, iov); - if (err >= 0) - iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE, - *iov, nr_segs, err); - return err; + return compat_import_iovec(save_addr ? READ : WRITE, + compat_ptr(uiov), nr_segs, + UIO_FASTIOV, iov, &kmsg->msg_iter); } /* Bleech... */ diff --git a/net/socket.c b/net/socket.c index 46f0e1d752b3..e5669cee0759 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1841,10 +1841,10 @@ struct used_address { unsigned int name_len; }; -static ssize_t copy_msghdr_from_user(struct msghdr *kmsg, - struct user_msghdr __user *umsg, - struct sockaddr __user **save_addr, - struct iovec **iov) +static int copy_msghdr_from_user(struct msghdr *kmsg, + struct user_msghdr __user *umsg, + struct sockaddr __user **save_addr, + struct iovec **iov) { struct sockaddr __user *uaddr; struct iovec __user *uiov; @@ -1890,13 +1890,8 @@ static ssize_t copy_msghdr_from_user(struct msghdr *kmsg, kmsg->msg_iocb = NULL; - err = rw_copy_check_uvector(save_addr ? READ : WRITE, - uiov, nr_segs, - UIO_FASTIOV, *iov, iov); - if (err >= 0) - iov_iter_init(&kmsg->msg_iter, save_addr ? READ : WRITE, - *iov, nr_segs, err); - return err; + return import_iovec(save_addr ? READ : WRITE, uiov, nr_segs, + UIO_FASTIOV, iov, &kmsg->msg_iter); } static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, @@ -1921,8 +1916,8 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, else err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov); if (err < 0) - goto out_freeiov; - total_len = err; + return err; + total_len = iov_iter_count(&msg_sys->msg_iter); err = -ENOBUFS; @@ -1988,8 +1983,7 @@ out_freectl: if (ctl_buf != ctl) sock_kfree_s(sock->sk, ctl_buf, ctl_len); out_freeiov: - if (iov != iovstack) - kfree(iov); + kfree(iov); return err; } @@ -2114,8 +2108,8 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, else err = copy_msghdr_from_user(msg_sys, msg, &uaddr, &iov); if (err < 0) - goto out_freeiov; - total_len = err; + return err; + total_len = iov_iter_count(&msg_sys->msg_iter); cmsg_ptr = (unsigned long)msg_sys->msg_control; msg_sys->msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT); @@ -2153,8 +2147,7 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg, err = len; out_freeiov: - if (iov != iovstack) - kfree(iov); + kfree(iov); return err; } From 6aa248145ab0b1809de2411cf129ec1fc315a46f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 21 Mar 2015 19:56:16 -0400 Subject: [PATCH 15/17] switch kernel_sendmsg() and kernel_recvmsg() to iov_iter_kvec() For kernel_sendmsg() that eliminates the need to play with setfs(); for kernel_recvmsg() it does *not* - a couple of callers are using it with non-NULL ->msg_control, which would be treated as userland address on recvmsg side of things. In all cases we are really setting a kvec-backed iov_iter, though. Signed-off-by: Al Viro --- net/socket.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/net/socket.c b/net/socket.c index e5669cee0759..b6ceeda65214 100644 --- a/net/socket.c +++ b/net/socket.c @@ -627,18 +627,8 @@ EXPORT_SYMBOL(sock_sendmsg); int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { - mm_segment_t oldfs = get_fs(); - int result; - - set_fs(KERNEL_DS); - /* - * the following is safe, since for compiler definitions of kvec and - * iovec are identical, yielding the same in-core layout and alignment - */ - iov_iter_init(&msg->msg_iter, WRITE, (struct iovec *)vec, num, size); - result = sock_sendmsg(sock, msg, size); - set_fs(oldfs); - return result; + iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size); + return sock_sendmsg(sock, msg, size); } EXPORT_SYMBOL(kernel_sendmsg); @@ -755,12 +745,8 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, mm_segment_t oldfs = get_fs(); int result; + iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, vec, num, size); set_fs(KERNEL_DS); - /* - * the following is safe, since for compiler definitions of kvec and - * iovec are identical, yielding the same in-core layout and alignment - */ - iov_iter_init(&msg->msg_iter, READ, (struct iovec *)vec, num, size); result = sock_recvmsg(sock, msg, size, flags); set_fs(oldfs); return result; From d8725c86aebaf3516e220760aaf5fefc73825188 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 11 Dec 2014 00:02:50 -0500 Subject: [PATCH 16/17] get rid of the size argument of sock_sendmsg() it's equal to iov_iter_count(&msg->msg_iter) in all cases Signed-off-by: Al Viro --- include/linux/net.h | 2 +- net/socket.c | 27 ++++++++++++++------------- net/sunrpc/svcsock.c | 2 +- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/linux/net.h b/include/linux/net.h index e74114bcca68..738ea48be889 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -211,7 +211,7 @@ int sock_create(int family, int type, int proto, struct socket **res); int sock_create_kern(int family, int type, int proto, struct socket **res); int sock_create_lite(int family, int type, int proto, struct socket **res); void sock_release(struct socket *sock); -int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len); +int sock_sendmsg(struct socket *sock, struct msghdr *msg); int sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname); diff --git a/net/socket.c b/net/socket.c index b6ceeda65214..21676e469b13 100644 --- a/net/socket.c +++ b/net/socket.c @@ -610,17 +610,19 @@ void __sock_tx_timestamp(const struct sock *sk, __u8 *tx_flags) } EXPORT_SYMBOL(__sock_tx_timestamp); -static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg, - size_t size) +static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { - return sock->ops->sendmsg(sock, msg, size); + int ret = sock->ops->sendmsg(sock, msg, iov_iter_count(&msg->msg_iter)); + BUG_ON(ret == -EIOCBQUEUED); + return ret; } -int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +int sock_sendmsg(struct socket *sock, struct msghdr *msg) { - int err = security_socket_sendmsg(sock, msg, size); + int err = security_socket_sendmsg(sock, msg, + iov_iter_count(&msg->msg_iter)); - return err ?: sock_sendmsg_nosec(sock, msg, size); + return err ?: sock_sendmsg_nosec(sock, msg); } EXPORT_SYMBOL(sock_sendmsg); @@ -628,7 +630,7 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC, vec, num, size); - return sock_sendmsg(sock, msg, size); + return sock_sendmsg(sock, msg); } EXPORT_SYMBOL(kernel_sendmsg); @@ -819,7 +821,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; - res = sock_sendmsg(sock, &msg, iov_iter_count(from)); + res = sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; } @@ -1657,7 +1659,7 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, if (sock->file->f_flags & O_NONBLOCK) flags |= MSG_DONTWAIT; msg.msg_flags = flags; - err = sock_sendmsg(sock, &msg, iov_iter_count(&msg.msg_iter)); + err = sock_sendmsg(sock, &msg); out_put: fput_light(sock->file, fput_needed); @@ -1892,7 +1894,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, __attribute__ ((aligned(sizeof(__kernel_size_t)))); /* 20 is size of ipv6_pktinfo */ unsigned char *ctl_buf = ctl; - int ctl_len, total_len; + int ctl_len; ssize_t err; msg_sys->msg_name = &address; @@ -1903,7 +1905,6 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, err = copy_msghdr_from_user(msg_sys, msg, NULL, &iov); if (err < 0) return err; - total_len = iov_iter_count(&msg_sys->msg_iter); err = -ENOBUFS; @@ -1950,10 +1951,10 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, used_address->name_len == msg_sys->msg_namelen && !memcmp(&used_address->name, msg_sys->msg_name, used_address->name_len)) { - err = sock_sendmsg_nosec(sock, msg_sys, total_len); + err = sock_sendmsg_nosec(sock, msg_sys); goto out_freectl; } - err = sock_sendmsg(sock, msg_sys, total_len); + err = sock_sendmsg(sock, msg_sys); /* * If this is sendmmsg() and sending to current destination address was * successful, remember it. diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index cc331b6cf573..0c8120229a03 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -257,7 +257,7 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) svc_set_cmsg_data(rqstp, cmh); - if (sock_sendmsg(sock, &msg, 0) < 0) + if (sock_sendmsg(sock, &msg) < 0) goto out; } From 01e97e6517053d7c0b9af5248e944a9209909cf5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 15 Dec 2014 21:39:31 -0500 Subject: [PATCH 17/17] new helper: msg_data_left() convert open-coded instances Signed-off-by: Al Viro --- crypto/algif_hash.c | 4 ++-- crypto/algif_skcipher.c | 4 ++-- drivers/vhost/net.c | 4 ++-- include/linux/socket.h | 5 +++++ net/core/datagram.c | 2 +- net/ipv4/tcp.c | 8 ++++---- net/rxrpc/ar-output.c | 19 +++++++++---------- net/socket.c | 4 ++-- 8 files changed, 27 insertions(+), 23 deletions(-) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index 0a465e0f3012..1396ad0787fc 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -56,8 +56,8 @@ static int hash_sendmsg(struct socket *sock, struct msghdr *msg, ctx->more = 0; - while (iov_iter_count(&msg->msg_iter)) { - int len = iov_iter_count(&msg->msg_iter); + while (msg_data_left(msg)) { + int len = msg_data_left(msg); if (len > limit) len = limit; diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index 8f903b6df299..945075292bc9 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -641,7 +641,7 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg, long copied = 0; lock_sock(sk); - while (iov_iter_count(&msg->msg_iter)) { + while (msg_data_left(msg)) { sgl = list_first_entry(&ctx->tsgl, struct skcipher_sg_list, list); sg = sgl->sg; @@ -655,7 +655,7 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg, goto unlock; } - used = min_t(unsigned long, ctx->used, iov_iter_count(&msg->msg_iter)); + used = min_t(unsigned long, ctx->used, msg_data_left(msg)); used = af_alg_make_sg(&ctx->rsgl, &msg->msg_iter, used); err = used; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 18f05bff8826..7d137a43cc86 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -357,13 +357,13 @@ static void handle_tx(struct vhost_net *net) iov_iter_init(&msg.msg_iter, WRITE, vq->iov, out, len); iov_iter_advance(&msg.msg_iter, hdr_size); /* Sanity check */ - if (!iov_iter_count(&msg.msg_iter)) { + if (!msg_data_left(&msg)) { vq_err(vq, "Unexpected header len for TX: " "%zd expected %zd\n", len, hdr_size); break; } - len = iov_iter_count(&msg.msg_iter); + len = msg_data_left(&msg); zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN && (nvq->upend_idx + 1) % UIO_MAXIOV != diff --git a/include/linux/socket.h b/include/linux/socket.h index c9852ef7e317..5bf59c8493b7 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -139,6 +139,11 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } +static inline size_t msg_data_left(struct msghdr *msg) +{ + return iov_iter_count(&msg->msg_iter); +} + /* "Socket"-level control message types: */ #define SCM_RIGHTS 0x01 /* rw: access rights (array of int) */ diff --git a/net/core/datagram.c b/net/core/datagram.c index df493d68330c..b80fb91bb3f7 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -673,7 +673,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (!chunk) return 0; - if (iov_iter_count(&msg->msg_iter) < chunk) { + if (msg_data_left(msg) < chunk) { if (__skb_checksum_complete(skb)) goto csum_error; if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 094a6822c71d..18e3a12eb1b2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1119,7 +1119,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) sg = !!(sk->sk_route_caps & NETIF_F_SG); - while (iov_iter_count(&msg->msg_iter)) { + while (msg_data_left(msg)) { int copy = 0; int max = size_goal; @@ -1163,8 +1163,8 @@ new_segment: } /* Try to append data to the end of skb. */ - if (copy > iov_iter_count(&msg->msg_iter)) - copy = iov_iter_count(&msg->msg_iter); + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); /* Where to copy to? */ if (skb_availroom(skb) > 0) { @@ -1221,7 +1221,7 @@ new_segment: tcp_skb_pcount_set(skb, 0); copied += copy; - if (!iov_iter_count(&msg->msg_iter)) { + if (!msg_data_left(msg)) { tcp_tx_timestamp(sk, skb); goto out; } diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c index 7a31a3958364..c0042807bfc6 100644 --- a/net/rxrpc/ar-output.c +++ b/net/rxrpc/ar-output.c @@ -564,8 +564,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, max &= ~(call->conn->size_align - 1UL); chunk = max; - if (chunk > iov_iter_count(&msg->msg_iter) && !more) - chunk = iov_iter_count(&msg->msg_iter); + if (chunk > msg_data_left(msg) && !more) + chunk = msg_data_left(msg); space = chunk + call->conn->size_align; space &= ~(call->conn->size_align - 1UL); @@ -608,11 +608,11 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, sp = rxrpc_skb(skb); /* append next segment of data to the current buffer */ - if (iov_iter_count(&msg->msg_iter) > 0) { + if (msg_data_left(msg) > 0) { int copy = skb_tailroom(skb); ASSERTCMP(copy, >, 0); - if (copy > iov_iter_count(&msg->msg_iter)) - copy = iov_iter_count(&msg->msg_iter); + if (copy > msg_data_left(msg)) + copy = msg_data_left(msg); if (copy > sp->remain) copy = sp->remain; @@ -633,7 +633,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, /* add the packet to the send queue if it's now full */ if (sp->remain <= 0 || - (iov_iter_count(&msg->msg_iter) == 0 && !more)) { + (msg_data_left(msg) == 0 && !more)) { struct rxrpc_connection *conn = call->conn; uint32_t seq; size_t pad; @@ -663,7 +663,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, sp->hdr.serviceId = conn->service_id; sp->hdr.flags = conn->out_clientflag; - if (iov_iter_count(&msg->msg_iter) == 0 && !more) + if (msg_data_left(msg) == 0 && !more) sp->hdr.flags |= RXRPC_LAST_PACKET; else if (CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz) > 1) @@ -679,11 +679,10 @@ static int rxrpc_send_data(struct rxrpc_sock *rx, memcpy(skb->head, &sp->hdr, sizeof(struct rxrpc_header)); - rxrpc_queue_packet(call, skb, - iov_iter_count(&msg->msg_iter) == 0 && !more); + rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more); skb = NULL; } - } while (iov_iter_count(&msg->msg_iter) > 0); + } while (msg_data_left(msg) > 0); success: ret = copied; diff --git a/net/socket.c b/net/socket.c index 21676e469b13..5b0126234606 100644 --- a/net/socket.c +++ b/net/socket.c @@ -612,7 +612,7 @@ EXPORT_SYMBOL(__sock_tx_timestamp); static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { - int ret = sock->ops->sendmsg(sock, msg, iov_iter_count(&msg->msg_iter)); + int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg)); BUG_ON(ret == -EIOCBQUEUED); return ret; } @@ -620,7 +620,7 @@ static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) int sock_sendmsg(struct socket *sock, struct msghdr *msg) { int err = security_socket_sendmsg(sock, msg, - iov_iter_count(&msg->msg_iter)); + msg_data_left(msg)); return err ?: sock_sendmsg_nosec(sock, msg); }