for-5.20/io_uring-zerocopy-send-2022-07-29
Merge tag 'for-5.20/io_uring-zerocopy-send-2022-07-29' of git://git.kernel.dk/linux-block

Pull io_uring zerocopy support from Jens Axboe:

 "This adds support for efficient zerocopy sends through io_uring. Both
  IPv4 and IPv6 are supported, as well as both TCP and UDP.

  The core network changes to support this are in a stable branch from
  Jakub that both io_uring and net-next have pulled in, and the io_uring
  changes are layered on top of that.

  All of the work has been done by Pavel"

* tag 'for-5.20/io_uring-zerocopy-send-2022-07-29' of git://git.kernel.dk/linux-block: (34 commits)
  io_uring: notification completion optimisation
  io_uring: export req alloc from core
  io_uring/net: use unsigned for flags
  io_uring/net: make page accounting more consistent
  io_uring/net: checks errors of zc mem accounting
  io_uring/net: improve io_get_notif_slot types
  selftests/io_uring: test zerocopy send
  io_uring: enable managed frags with register buffers
  io_uring: add zc notification flush requests
  io_uring: rename IORING_OP_FILES_UPDATE
  io_uring: flush notifiers after sendzc
  io_uring: sendzc with fixed buffers
  io_uring: allow to pass addr into sendzc
  io_uring: account locked pages for non-fixed zc
  io_uring: wire send zc request type
  io_uring: add notification slot registration
  io_uring: add rsrc referencing for notifiers
  io_uring: complete notifiers in tw
  io_uring: cache struct io_notif
  io_uring: add zc notification infrastructure
  ...
Commit 42df1cbf6a
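For orientation before the diff itself, here is a minimal, hypothetical sketch of how userspace might drive the new interface, based only on the uapi additions visible below (IORING_REGISTER_NOTIFIERS, struct io_uring_notification_register/_slot, IORING_OP_SENDZC_NOTIF, the notification_idx/addr_len SQE fields, and IORING_RECVSEND_NOTIF_FLUSH). Ring setup and submission are omitted; the selftest added at the end of the series shows a complete program.

/* Sketch only: assumes a 5.20-era <linux/io_uring.h> with the additions
 * from this merge, plus an already created ring (ring_fd) and UDP socket. */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <netinet/in.h>
#include <linux/io_uring.h>

/* Register one notification slot; its CQEs will carry user_data == tag. */
static int register_notif_slot(int ring_fd, uint64_t tag)
{
	struct io_uring_notification_slot slot = { .tag = tag };
	struct io_uring_notification_register reg = {
		.nr_slots = 1,
		.data = (uintptr_t)&slot,
	};

	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_NOTIFIERS, &reg, sizeof(reg));
}

/* Fill an SQE for a zerocopy send bound to notification slot 0,
 * flushing the notification once the send has been issued. */
static void prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
			const void *buf, size_t len,
			const struct sockaddr_in *dst)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_SENDZC_NOTIF;
	sqe->fd = sockfd;
	sqe->addr = (uintptr_t)buf;
	sqe->len = len;
	sqe->msg_flags = MSG_DONTWAIT;
	sqe->notification_idx = 0;
	sqe->ioprio = IORING_RECVSEND_NOTIF_FLUSH;
	/* optional sendto()-style destination address */
	sqe->addr2 = (uintptr_t)dst;
	sqe->addr_len = sizeof(*dst);
}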
@@ -4,6 +4,7 @@
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/bitmap.h>
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>

struct io_wq_work_node {

@@ -33,6 +34,9 @@ struct io_file_table {
	unsigned int alloc_hint;
};

struct io_notif;
struct io_notif_slot;

struct io_hash_bucket {
	spinlock_t lock;
	struct hlist_head list;

@@ -43,6 +47,30 @@ struct io_hash_table {
	unsigned hash_bits;
};

/*
 * Arbitrary limit, can be raised if need be
 */
#define IO_RINGFD_REG_MAX 16

struct io_uring_task {
	/* submission side */
	int cached_refs;
	const struct io_ring_ctx *last;
	struct io_wq *io_wq;
	struct file *registered_rings[IO_RINGFD_REG_MAX];

	struct xarray xa;
	struct wait_queue_head wait;
	atomic_t in_idle;
	atomic_t inflight_tracked;
	struct percpu_counter inflight;

	struct { /* task_work */
		struct llist_head task_list;
		struct callback_head task_work;
	} ____cacheline_aligned_in_smp;
};

struct io_uring {
	u32 head ____cacheline_aligned_in_smp;
	u32 tail ____cacheline_aligned_in_smp;

@@ -212,6 +240,8 @@ struct io_ring_ctx {
	unsigned nr_user_files;
	unsigned nr_user_bufs;
	struct io_mapped_ubuf **user_bufs;
	struct io_notif_slot *notif_slots;
	unsigned nr_notif_slots;

	struct io_submit_state submit_state;

@ -686,10 +686,18 @@ enum {
|
|||
* charged to the kernel memory.
|
||||
*/
|
||||
SKBFL_PURE_ZEROCOPY = BIT(2),
|
||||
|
||||
SKBFL_DONT_ORPHAN = BIT(3),
|
||||
|
||||
/* page references are managed by the ubuf_info, so it's safe to
|
||||
* use frags only up until ubuf_info is released
|
||||
*/
|
||||
SKBFL_MANAGED_FRAG_REFS = BIT(4),
|
||||
};
|
||||
|
||||
#define SKBFL_ZEROCOPY_FRAG (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
|
||||
#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY)
|
||||
#define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
|
||||
SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)
|
||||
|
||||
/*
|
||||
* The callback notifies userspace to release buffers when skb DMA is done in
|
||||
|
@ -1773,13 +1781,14 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);
|
|||
void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
|
||||
bool success);
|
||||
|
||||
int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
|
||||
struct iov_iter *from, size_t length);
|
||||
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
|
||||
struct sk_buff *skb, struct iov_iter *from,
|
||||
size_t length);
|
||||
|
||||
static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
|
||||
struct msghdr *msg, int len)
|
||||
{
|
||||
return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
|
||||
return __zerocopy_sg_from_iter(msg, skb->sk, skb, &msg->msg_iter, len);
|
||||
}
|
||||
|
||||
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
|
||||
|
@ -1806,6 +1815,11 @@ static inline bool skb_zcopy_pure(const struct sk_buff *skb)
|
|||
return skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY;
|
||||
}
|
||||
|
||||
static inline bool skb_zcopy_managed(const struct sk_buff *skb)
|
||||
{
|
||||
return skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS;
|
||||
}
|
||||
|
||||
static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
|
||||
const struct sk_buff *skb2)
|
||||
{
|
||||
|
@ -1880,6 +1894,14 @@ static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
|
|||
}
|
||||
}
|
||||
|
||||
void __skb_zcopy_downgrade_managed(struct sk_buff *skb);
|
||||
|
||||
static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
|
||||
{
|
||||
if (unlikely(skb_zcopy_managed(skb)))
|
||||
__skb_zcopy_downgrade_managed(skb);
|
||||
}
|
||||
|
||||
static inline void skb_mark_not_on_list(struct sk_buff *skb)
|
||||
{
|
||||
skb->next = NULL;
|
||||
|
@ -2528,6 +2550,22 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
|
|||
return skb_headlen(skb) + __skb_pagelen(skb);
|
||||
}
|
||||
|
||||
static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
|
||||
int i, struct page *page,
|
||||
int off, int size)
|
||||
{
|
||||
skb_frag_t *frag = &shinfo->frags[i];
|
||||
|
||||
/*
|
||||
* Propagate page pfmemalloc to the skb if we can. The problem is
|
||||
* that not all callers have unique ownership of the page but rely
|
||||
* on page_is_pfmemalloc doing the right thing(tm).
|
||||
*/
|
||||
frag->bv_page = page;
|
||||
frag->bv_offset = off;
|
||||
skb_frag_size_set(frag, size);
|
||||
}
|
||||
|
||||
/**
|
||||
* __skb_fill_page_desc - initialise a paged fragment in an skb
|
||||
* @skb: buffer containing fragment to be initialised
|
||||
|
@ -2544,17 +2582,7 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
|
|||
static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
|
||||
struct page *page, int off, int size)
|
||||
{
|
||||
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
|
||||
|
||||
/*
|
||||
* Propagate page pfmemalloc to the skb if we can. The problem is
|
||||
* that not all callers have unique ownership of the page but rely
|
||||
* on page_is_pfmemalloc doing the right thing(tm).
|
||||
*/
|
||||
frag->bv_page = page;
|
||||
frag->bv_offset = off;
|
||||
skb_frag_size_set(frag, size);
|
||||
|
||||
__skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
|
||||
page = compound_head(page);
|
||||
if (page_is_pfmemalloc(page))
|
||||
skb->pfmemalloc = true;
|
||||
|
@ -3182,8 +3210,7 @@ static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
|
|||
{
|
||||
if (likely(!skb_zcopy(skb)))
|
||||
return 0;
|
||||
if (!skb_zcopy_is_nouarg(skb) &&
|
||||
skb_uarg(skb)->callback == msg_zerocopy_callback)
|
||||
if (skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN)
|
||||
return 0;
|
||||
return skb_copy_ubufs(skb, gfp_mask);
|
||||
}
|
||||
|
@ -3496,7 +3523,10 @@ static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
|
|||
*/
|
||||
static inline void skb_frag_unref(struct sk_buff *skb, int f)
|
||||
{
|
||||
__skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
|
||||
struct skb_shared_info *shinfo = skb_shinfo(skb);
|
||||
|
||||
if (!skb_zcopy_managed(skb))
|
||||
__skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@@ -14,6 +14,8 @@ struct file;
struct pid;
struct cred;
struct socket;
struct sock;
struct sk_buff;

#define __sockaddr_check_size(size) \
	BUILD_BUG_ON(((size) > sizeof(struct __kernel_sockaddr_storage)))

@@ -69,6 +71,9 @@ struct msghdr {
	unsigned int msg_flags;		/* flags on received message */
	__kernel_size_t msg_controllen;	/* ancillary data buffer length */
	struct kiocb *msg_iocb;		/* ptr to iocb for async requests */
	struct ubuf_info *msg_ubuf;
	int (*sg_from_iter)(struct sock *sk, struct sk_buff *skb,
			    struct iov_iter *from, size_t length);
};

struct user_msghdr {

@@ -66,6 +66,10 @@ struct io_uring_sqe {
	union {
		__s32 splice_fd_in;
		__u32 file_index;
		struct {
			__u16 notification_idx;
			__u16 addr_len;
		};
	};
	union {
		struct {

@@ -170,7 +174,8 @@ enum io_uring_op {
	IORING_OP_FALLOCATE,
	IORING_OP_OPENAT,
	IORING_OP_CLOSE,
	IORING_OP_FILES_UPDATE,
	IORING_OP_RSRC_UPDATE,
	IORING_OP_FILES_UPDATE = IORING_OP_RSRC_UPDATE,
	IORING_OP_STATX,
	IORING_OP_READ,
	IORING_OP_WRITE,

@@ -197,6 +202,7 @@ enum io_uring_op {
	IORING_OP_GETXATTR,
	IORING_OP_SOCKET,
	IORING_OP_URING_CMD,
	IORING_OP_SENDZC_NOTIF,

	/* this goes last, obviously */
	IORING_OP_LAST,

@@ -218,6 +224,7 @@ enum io_uring_op {
#define IORING_TIMEOUT_ETIME_SUCCESS	(1U << 5)
#define IORING_TIMEOUT_CLOCK_MASK	(IORING_TIMEOUT_BOOTTIME | IORING_TIMEOUT_REALTIME)
#define IORING_TIMEOUT_UPDATE_MASK	(IORING_TIMEOUT_UPDATE | IORING_LINK_TIMEOUT_UPDATE)

/*
 * sqe->splice_flags
 * extends splice(2) flags

@@ -267,15 +274,32 @@ enum io_uring_op {
 * IORING_RECV_MULTISHOT	Multishot recv. Sets IORING_CQE_F_MORE if
 *				the handler will continue to report
 *				CQEs on behalf of the same SQE.
 *
 * IORING_RECVSEND_FIXED_BUF	Use registered buffers, the index is stored in
 *				the buf_index field.
 *
 * IORING_RECVSEND_NOTIF_FLUSH	Flush a notification after a successful send.
 *				Only for zerocopy sends.
 */
#define IORING_RECVSEND_POLL_FIRST	(1U << 0)
#define IORING_RECV_MULTISHOT		(1U << 1)
#define IORING_RECVSEND_FIXED_BUF	(1U << 2)
#define IORING_RECVSEND_NOTIF_FLUSH	(1U << 3)

/*
 * accept flags stored in sqe->ioprio
 */
#define IORING_ACCEPT_MULTISHOT	(1U << 0)

/*
 * IORING_OP_RSRC_UPDATE flags
 */
enum {
	IORING_RSRC_UPDATE_FILES,
	IORING_RSRC_UPDATE_NOTIF,
};

/*
 * IORING_OP_MSG_RING command types, stored in sqe->addr
 */

@@ -457,6 +481,10 @@ enum {
	/* register a range of fixed file slots for automatic slot allocation */
	IORING_REGISTER_FILE_ALLOC_RANGE	= 25,

	/* zerocopy notification API */
	IORING_REGISTER_NOTIFIERS		= 26,
	IORING_UNREGISTER_NOTIFIERS		= 27,

	/* this goes last */
	IORING_REGISTER_LAST
};

@@ -503,6 +531,19 @@ struct io_uring_rsrc_update2 {
	__u32 resv2;
};

struct io_uring_notification_slot {
	__u64 tag;
	__u64 resv[3];
};

struct io_uring_notification_register {
	__u32 nr_slots;
	__u32 resv;
	__u64 resv2;
	__u64 data;
	__u64 resv3;
};

/* Skip updating fd indexes set to this value in the fd table */
#define IORING_REGISTER_FILES_SKIP	(-2)

@@ -7,5 +7,5 @@ obj-$(CONFIG_IO_URING) += io_uring.o xattr.o nop.o fs.o splice.o \
		openclose.o uring_cmd.o epoll.o \
		statx.o net.o msg_ring.o timeout.o \
		sqpoll.o fdinfo.o tctx.o poll.o \
		cancel.o kbuf.o rsrc.o rw.o opdef.o
		cancel.o kbuf.o rsrc.o rw.o opdef.o notif.o
obj-$(CONFIG_IO_WQ) += io-wq.o

@ -90,6 +90,7 @@
|
|||
#include "rsrc.h"
|
||||
#include "cancel.h"
|
||||
#include "net.h"
|
||||
#include "notif.h"
|
||||
|
||||
#include "timeout.h"
|
||||
#include "poll.h"
|
||||
|
@ -608,7 +609,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static void __io_put_task(struct task_struct *task, int nr)
|
||||
void __io_put_task(struct task_struct *task, int nr)
|
||||
{
|
||||
struct io_uring_task *tctx = task->io_uring;
|
||||
|
||||
|
@ -618,16 +619,7 @@ static void __io_put_task(struct task_struct *task, int nr)
|
|||
put_task_struct_many(task, nr);
|
||||
}
|
||||
|
||||
/* must to be called somewhat shortly after putting a request */
|
||||
static inline void io_put_task(struct task_struct *task, int nr)
|
||||
{
|
||||
if (likely(task == current))
|
||||
task->io_uring->cached_refs += nr;
|
||||
else
|
||||
__io_put_task(task, nr);
|
||||
}
|
||||
|
||||
static void io_task_refs_refill(struct io_uring_task *tctx)
|
||||
void io_task_refs_refill(struct io_uring_task *tctx)
|
||||
{
|
||||
unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR;
|
||||
|
||||
|
@ -636,15 +628,6 @@ static void io_task_refs_refill(struct io_uring_task *tctx)
|
|||
tctx->cached_refs += refill;
|
||||
}
|
||||
|
||||
static inline void io_get_task_refs(int nr)
|
||||
{
|
||||
struct io_uring_task *tctx = current->io_uring;
|
||||
|
||||
tctx->cached_refs -= nr;
|
||||
if (unlikely(tctx->cached_refs < 0))
|
||||
io_task_refs_refill(tctx);
|
||||
}
|
||||
|
||||
static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
|
||||
{
|
||||
struct io_uring_task *tctx = task->io_uring;
|
||||
|
@ -741,9 +724,8 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx)
|
|||
return &rings->cqes[off];
|
||||
}
|
||||
|
||||
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx,
|
||||
u64 user_data, s32 res, u32 cflags,
|
||||
bool allow_overflow)
|
||||
bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
|
||||
bool allow_overflow)
|
||||
{
|
||||
struct io_uring_cqe *cqe;
|
||||
|
||||
|
@ -868,18 +850,13 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx,
|
|||
spin_unlock(&ctx->completion_lock);
|
||||
}
|
||||
|
||||
static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
|
||||
{
|
||||
return !ctx->submit_state.free_list.next;
|
||||
}
|
||||
|
||||
/*
|
||||
* A request might get retired back into the request caches even before opcode
|
||||
* handlers and io_issue_sqe() are done with it, e.g. inline completion path.
|
||||
* Because of that, io_alloc_req() should be called only under ->uring_lock
|
||||
* and with extra caution to not get a request that is still worked on.
|
||||
*/
|
||||
static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
|
||||
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
|
||||
__must_hold(&ctx->uring_lock)
|
||||
{
|
||||
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
|
||||
|
@ -920,21 +897,6 @@ static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
|
|||
return true;
|
||||
}
|
||||
|
||||
static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
|
||||
{
|
||||
if (unlikely(io_req_cache_empty(ctx)))
|
||||
return __io_alloc_req_refill(ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_wq_work_node *node;
|
||||
|
||||
node = wq_stack_extract(&ctx->submit_state.free_list);
|
||||
return container_of(node, struct io_kiocb, comp_list);
|
||||
}
|
||||
|
||||
static inline void io_dismantle_req(struct io_kiocb *req)
|
||||
{
|
||||
unsigned int flags = req->flags;
|
||||
|
@ -2500,6 +2462,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
|
|||
}
|
||||
#endif
|
||||
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
|
||||
WARN_ON_ONCE(ctx->notif_slots || ctx->nr_notif_slots);
|
||||
|
||||
io_mem_free(ctx->rings);
|
||||
io_mem_free(ctx->sq_sqes);
|
||||
|
@ -2676,6 +2639,7 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
|
|||
io_unregister_personality(ctx, index);
|
||||
if (ctx->rings)
|
||||
io_poll_remove_all(ctx, NULL, true);
|
||||
io_notif_unregister(ctx);
|
||||
mutex_unlock(&ctx->uring_lock);
|
||||
|
||||
/* failed during ring init, it couldn't have issued any requests */
|
||||
|
@ -3874,6 +3838,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
|
|||
break;
|
||||
ret = io_register_file_alloc_range(ctx, arg);
|
||||
break;
|
||||
case IORING_REGISTER_NOTIFIERS:
|
||||
ret = io_notif_register(ctx, arg, nr_args);
|
||||
break;
|
||||
case IORING_UNREGISTER_NOTIFIERS:
|
||||
ret = -EINVAL;
|
||||
if (arg || nr_args)
|
||||
break;
|
||||
ret = io_notif_unregister(ctx);
|
||||
break;
|
||||
default:
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
|
|
|
@ -33,6 +33,8 @@ void io_req_complete_post(struct io_kiocb *req);
|
|||
void __io_req_complete_post(struct io_kiocb *req);
|
||||
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
|
||||
bool allow_overflow);
|
||||
bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
|
||||
bool allow_overflow);
|
||||
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);
|
||||
|
||||
struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
|
||||
|
@ -71,6 +73,9 @@ void io_wq_submit_work(struct io_wq_work *work);
|
|||
|
||||
void io_free_req(struct io_kiocb *req);
|
||||
void io_queue_next(struct io_kiocb *req);
|
||||
void __io_put_task(struct task_struct *task, int nr);
|
||||
void io_task_refs_refill(struct io_uring_task *tctx);
|
||||
bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
|
||||
|
||||
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
|
||||
bool cancel_all);
|
||||
|
@ -258,4 +263,42 @@ static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
|
|||
__io_commit_cqring_flush(ctx);
|
||||
}
|
||||
|
||||
/* must to be called somewhat shortly after putting a request */
|
||||
static inline void io_put_task(struct task_struct *task, int nr)
|
||||
{
|
||||
if (likely(task == current))
|
||||
task->io_uring->cached_refs += nr;
|
||||
else
|
||||
__io_put_task(task, nr);
|
||||
}
|
||||
|
||||
static inline void io_get_task_refs(int nr)
|
||||
{
|
||||
struct io_uring_task *tctx = current->io_uring;
|
||||
|
||||
tctx->cached_refs -= nr;
|
||||
if (unlikely(tctx->cached_refs < 0))
|
||||
io_task_refs_refill(tctx);
|
||||
}
|
||||
|
||||
static inline bool io_req_cache_empty(struct io_ring_ctx *ctx)
|
||||
{
|
||||
return !ctx->submit_state.free_list.next;
|
||||
}
|
||||
|
||||
static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx)
|
||||
{
|
||||
if (unlikely(io_req_cache_empty(ctx)))
|
||||
return __io_alloc_req_refill(ctx);
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx)
|
||||
{
|
||||
struct io_wq_work_node *node;
|
||||
|
||||
node = wq_stack_extract(&ctx->submit_state.free_list);
|
||||
return container_of(node, struct io_kiocb, comp_list);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
io_uring/net.c: 193 changed lines
|
@ -14,6 +14,8 @@
|
|||
#include "kbuf.h"
|
||||
#include "alloc_cache.h"
|
||||
#include "net.h"
|
||||
#include "notif.h"
|
||||
#include "rsrc.h"
|
||||
|
||||
#if defined(CONFIG_NET)
|
||||
struct io_shutdown {
|
||||
|
@ -53,10 +55,21 @@ struct io_sr_msg {
|
|||
struct user_msghdr __user *umsg;
|
||||
void __user *buf;
|
||||
};
|
||||
int msg_flags;
|
||||
unsigned msg_flags;
|
||||
unsigned flags;
|
||||
size_t len;
|
||||
size_t done_io;
|
||||
unsigned int flags;
|
||||
};
|
||||
|
||||
struct io_sendzc {
|
||||
struct file *file;
|
||||
void __user *buf;
|
||||
size_t len;
|
||||
u16 slot_idx;
|
||||
unsigned msg_flags;
|
||||
unsigned flags;
|
||||
unsigned addr_len;
|
||||
void __user *addr;
|
||||
};
|
||||
|
||||
#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
|
||||
|
@ -294,6 +307,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
|
|||
msg.msg_control = NULL;
|
||||
msg.msg_controllen = 0;
|
||||
msg.msg_namelen = 0;
|
||||
msg.msg_ubuf = NULL;
|
||||
|
||||
flags = sr->msg_flags;
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
|
@ -783,6 +797,7 @@ retry_multishot:
|
|||
msg.msg_flags = 0;
|
||||
msg.msg_controllen = 0;
|
||||
msg.msg_iocb = NULL;
|
||||
msg.msg_ubuf = NULL;
|
||||
|
||||
flags = sr->msg_flags;
|
||||
if (force_nonblock)
|
||||
|
@ -832,6 +847,180 @@ out_free:
|
|||
return ret;
|
||||
}
|
||||
|
||||
int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_sendzc *zc = io_kiocb_to_cmd(req);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
||||
if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))
|
||||
return -EINVAL;
|
||||
|
||||
zc->flags = READ_ONCE(sqe->ioprio);
|
||||
if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST |
|
||||
IORING_RECVSEND_FIXED_BUF | IORING_RECVSEND_NOTIF_FLUSH))
|
||||
return -EINVAL;
|
||||
if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
|
||||
unsigned idx = READ_ONCE(sqe->buf_index);
|
||||
|
||||
if (unlikely(idx >= ctx->nr_user_bufs))
|
||||
return -EFAULT;
|
||||
idx = array_index_nospec(idx, ctx->nr_user_bufs);
|
||||
req->imu = READ_ONCE(ctx->user_bufs[idx]);
|
||||
io_req_set_rsrc_node(req, ctx, 0);
|
||||
}
|
||||
|
||||
zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr));
|
||||
zc->len = READ_ONCE(sqe->len);
|
||||
zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
|
||||
zc->slot_idx = READ_ONCE(sqe->notification_idx);
|
||||
if (zc->msg_flags & MSG_DONTWAIT)
|
||||
req->flags |= REQ_F_NOWAIT;
|
||||
|
||||
zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2));
|
||||
zc->addr_len = READ_ONCE(sqe->addr_len);
|
||||
|
||||
#ifdef CONFIG_COMPAT
|
||||
if (req->ctx->compat)
|
||||
zc->msg_flags |= MSG_CMSG_COMPAT;
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int io_sg_from_iter(struct sock *sk, struct sk_buff *skb,
|
||||
struct iov_iter *from, size_t length)
|
||||
{
|
||||
struct skb_shared_info *shinfo = skb_shinfo(skb);
|
||||
int frag = shinfo->nr_frags;
|
||||
int ret = 0;
|
||||
struct bvec_iter bi;
|
||||
ssize_t copied = 0;
|
||||
unsigned long truesize = 0;
|
||||
|
||||
if (!shinfo->nr_frags)
|
||||
shinfo->flags |= SKBFL_MANAGED_FRAG_REFS;
|
||||
|
||||
if (!skb_zcopy_managed(skb) || !iov_iter_is_bvec(from)) {
|
||||
skb_zcopy_downgrade_managed(skb);
|
||||
return __zerocopy_sg_from_iter(NULL, sk, skb, from, length);
|
||||
}
|
||||
|
||||
bi.bi_size = min(from->count, length);
|
||||
bi.bi_bvec_done = from->iov_offset;
|
||||
bi.bi_idx = 0;
|
||||
|
||||
while (bi.bi_size && frag < MAX_SKB_FRAGS) {
|
||||
struct bio_vec v = mp_bvec_iter_bvec(from->bvec, bi);
|
||||
|
||||
copied += v.bv_len;
|
||||
truesize += PAGE_ALIGN(v.bv_len + v.bv_offset);
|
||||
__skb_fill_page_desc_noacc(shinfo, frag++, v.bv_page,
|
||||
v.bv_offset, v.bv_len);
|
||||
bvec_iter_advance_single(from->bvec, &bi, v.bv_len);
|
||||
}
|
||||
if (bi.bi_size)
|
||||
ret = -EMSGSIZE;
|
||||
|
||||
shinfo->nr_frags = frag;
|
||||
from->bvec += bi.bi_idx;
|
||||
from->nr_segs -= bi.bi_idx;
|
||||
from->count = bi.bi_size;
|
||||
from->iov_offset = bi.bi_bvec_done;
|
||||
|
||||
skb->data_len += copied;
|
||||
skb->len += copied;
|
||||
skb->truesize += truesize;
|
||||
|
||||
if (sk && sk->sk_type == SOCK_STREAM) {
|
||||
sk_wmem_queued_add(sk, truesize);
|
||||
if (!skb_zcopy_pure(skb))
|
||||
sk_mem_charge(sk, truesize);
|
||||
} else {
|
||||
refcount_add(truesize, &skb->sk->sk_wmem_alloc);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct sockaddr_storage address;
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
struct io_sendzc *zc = io_kiocb_to_cmd(req);
|
||||
struct io_notif_slot *notif_slot;
|
||||
struct io_kiocb *notif;
|
||||
struct msghdr msg;
|
||||
struct iovec iov;
|
||||
struct socket *sock;
|
||||
unsigned msg_flags;
|
||||
int ret, min_ret = 0;
|
||||
|
||||
if (!(req->flags & REQ_F_POLLED) &&
|
||||
(zc->flags & IORING_RECVSEND_POLL_FIRST))
|
||||
return -EAGAIN;
|
||||
|
||||
if (issue_flags & IO_URING_F_UNLOCKED)
|
||||
return -EAGAIN;
|
||||
sock = sock_from_file(req->file);
|
||||
if (unlikely(!sock))
|
||||
return -ENOTSOCK;
|
||||
|
||||
notif_slot = io_get_notif_slot(ctx, zc->slot_idx);
|
||||
if (!notif_slot)
|
||||
return -EINVAL;
|
||||
notif = io_get_notif(ctx, notif_slot);
|
||||
if (!notif)
|
||||
return -ENOMEM;
|
||||
|
||||
msg.msg_name = NULL;
|
||||
msg.msg_control = NULL;
|
||||
msg.msg_controllen = 0;
|
||||
msg.msg_namelen = 0;
|
||||
|
||||
if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
|
||||
ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
|
||||
(u64)(uintptr_t)zc->buf, zc->len);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
} else {
|
||||
ret = import_single_range(WRITE, zc->buf, zc->len, &iov,
|
||||
&msg.msg_iter);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
ret = io_notif_account_mem(notif, zc->len);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (zc->addr) {
|
||||
ret = move_addr_to_kernel(zc->addr, zc->addr_len, &address);
|
||||
if (unlikely(ret < 0))
|
||||
return ret;
|
||||
msg.msg_name = (struct sockaddr *)&address;
|
||||
msg.msg_namelen = zc->addr_len;
|
||||
}
|
||||
|
||||
msg_flags = zc->msg_flags | MSG_ZEROCOPY;
|
||||
if (issue_flags & IO_URING_F_NONBLOCK)
|
||||
msg_flags |= MSG_DONTWAIT;
|
||||
if (msg_flags & MSG_WAITALL)
|
||||
min_ret = iov_iter_count(&msg.msg_iter);
|
||||
|
||||
msg.msg_flags = msg_flags;
|
||||
msg.msg_ubuf = &io_notif_to_data(notif)->uarg;
|
||||
msg.sg_from_iter = io_sg_from_iter;
|
||||
ret = sock_sendmsg(sock, &msg);
|
||||
|
||||
if (unlikely(ret < min_ret)) {
|
||||
if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
|
||||
return -EAGAIN;
|
||||
return ret == -ERESTARTSYS ? -EINTR : ret;
|
||||
}
|
||||
|
||||
if (zc->flags & IORING_RECVSEND_NOTIF_FLUSH)
|
||||
io_notif_slot_flush_submit(notif_slot, 0);
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_accept *accept = io_kiocb_to_cmd(req);
|
||||
|
|
|
@ -52,6 +52,9 @@ int io_connect_prep_async(struct io_kiocb *req);
|
|||
int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_connect(struct io_kiocb *req, unsigned int issue_flags);
|
||||
|
||||
int io_sendzc(struct io_kiocb *req, unsigned int issue_flags);
|
||||
int io_sendzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
|
||||
void io_netmsg_cache_free(struct io_cache_entry *entry);
|
||||
#else
|
||||
static inline void io_netmsg_cache_free(struct io_cache_entry *entry)
|
||||
|
|
|
@ -0,0 +1,159 @@
|
|||
#include <linux/kernel.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/net.h>
|
||||
#include <linux/io_uring.h>
|
||||
|
||||
#include "io_uring.h"
|
||||
#include "notif.h"
|
||||
#include "rsrc.h"
|
||||
|
||||
static void __io_notif_complete_tw(struct io_kiocb *notif, bool *locked)
|
||||
{
|
||||
struct io_notif_data *nd = io_notif_to_data(notif);
|
||||
struct io_ring_ctx *ctx = notif->ctx;
|
||||
|
||||
if (nd->account_pages && ctx->user) {
|
||||
__io_unaccount_mem(ctx->user, nd->account_pages);
|
||||
nd->account_pages = 0;
|
||||
}
|
||||
io_req_task_complete(notif, locked);
|
||||
}
|
||||
|
||||
static inline void io_notif_complete(struct io_kiocb *notif)
|
||||
__must_hold(¬if->ctx->uring_lock)
|
||||
{
|
||||
bool locked = true;
|
||||
|
||||
__io_notif_complete_tw(notif, &locked);
|
||||
}
|
||||
|
||||
static void io_uring_tx_zerocopy_callback(struct sk_buff *skb,
|
||||
struct ubuf_info *uarg,
|
||||
bool success)
|
||||
{
|
||||
struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg);
|
||||
struct io_kiocb *notif = cmd_to_io_kiocb(nd);
|
||||
|
||||
if (refcount_dec_and_test(&uarg->refcnt)) {
|
||||
notif->io_task_work.func = __io_notif_complete_tw;
|
||||
io_req_task_work_add(notif);
|
||||
}
|
||||
}
|
||||
|
||||
struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
|
||||
struct io_notif_slot *slot)
|
||||
__must_hold(&ctx->uring_lock)
|
||||
{
|
||||
struct io_kiocb *notif;
|
||||
struct io_notif_data *nd;
|
||||
|
||||
if (unlikely(!io_alloc_req_refill(ctx)))
|
||||
return NULL;
|
||||
notif = io_alloc_req(ctx);
|
||||
notif->opcode = IORING_OP_NOP;
|
||||
notif->flags = 0;
|
||||
notif->file = NULL;
|
||||
notif->task = current;
|
||||
io_get_task_refs(1);
|
||||
notif->rsrc_node = NULL;
|
||||
io_req_set_rsrc_node(notif, ctx, 0);
|
||||
notif->cqe.user_data = slot->tag;
|
||||
notif->cqe.flags = slot->seq++;
|
||||
notif->cqe.res = 0;
|
||||
|
||||
nd = io_notif_to_data(notif);
|
||||
nd->account_pages = 0;
|
||||
nd->uarg.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
|
||||
nd->uarg.callback = io_uring_tx_zerocopy_callback;
|
||||
/* master ref owned by io_notif_slot, will be dropped on flush */
|
||||
refcount_set(&nd->uarg.refcnt, 1);
|
||||
return notif;
|
||||
}
|
||||
|
||||
void io_notif_slot_flush(struct io_notif_slot *slot)
|
||||
__must_hold(&ctx->uring_lock)
|
||||
{
|
||||
struct io_kiocb *notif = slot->notif;
|
||||
struct io_notif_data *nd = io_notif_to_data(notif);
|
||||
|
||||
slot->notif = NULL;
|
||||
|
||||
/* drop slot's master ref */
|
||||
if (refcount_dec_and_test(&nd->uarg.refcnt))
|
||||
io_notif_complete(notif);
|
||||
}
|
||||
|
||||
__cold int io_notif_unregister(struct io_ring_ctx *ctx)
|
||||
__must_hold(&ctx->uring_lock)
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!ctx->notif_slots)
|
||||
return -ENXIO;
|
||||
|
||||
for (i = 0; i < ctx->nr_notif_slots; i++) {
|
||||
struct io_notif_slot *slot = &ctx->notif_slots[i];
|
||||
struct io_kiocb *notif = slot->notif;
|
||||
struct io_notif_data *nd;
|
||||
|
||||
if (!notif)
|
||||
continue;
|
||||
nd = io_kiocb_to_cmd(notif);
|
||||
slot->notif = NULL;
|
||||
if (!refcount_dec_and_test(&nd->uarg.refcnt))
|
||||
continue;
|
||||
notif->io_task_work.func = __io_notif_complete_tw;
|
||||
io_req_task_work_add(notif);
|
||||
}
|
||||
|
||||
kvfree(ctx->notif_slots);
|
||||
ctx->notif_slots = NULL;
|
||||
ctx->nr_notif_slots = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
__cold int io_notif_register(struct io_ring_ctx *ctx,
|
||||
void __user *arg, unsigned int size)
|
||||
__must_hold(&ctx->uring_lock)
|
||||
{
|
||||
struct io_uring_notification_slot __user *slots;
|
||||
struct io_uring_notification_slot slot;
|
||||
struct io_uring_notification_register reg;
|
||||
unsigned i;
|
||||
|
||||
BUILD_BUG_ON(sizeof(struct io_notif_data) > 64);
|
||||
|
||||
if (ctx->nr_notif_slots)
|
||||
return -EBUSY;
|
||||
if (size != sizeof(reg))
|
||||
return -EINVAL;
|
||||
if (copy_from_user(®, arg, sizeof(reg)))
|
||||
return -EFAULT;
|
||||
if (!reg.nr_slots || reg.nr_slots > IORING_MAX_NOTIF_SLOTS)
|
||||
return -EINVAL;
|
||||
if (reg.resv || reg.resv2 || reg.resv3)
|
||||
return -EINVAL;
|
||||
|
||||
slots = u64_to_user_ptr(reg.data);
|
||||
ctx->notif_slots = kvcalloc(reg.nr_slots, sizeof(ctx->notif_slots[0]),
|
||||
GFP_KERNEL_ACCOUNT);
|
||||
if (!ctx->notif_slots)
|
||||
return -ENOMEM;
|
||||
|
||||
for (i = 0; i < reg.nr_slots; i++, ctx->nr_notif_slots++) {
|
||||
struct io_notif_slot *notif_slot = &ctx->notif_slots[i];
|
||||
|
||||
if (copy_from_user(&slot, &slots[i], sizeof(slot))) {
|
||||
io_notif_unregister(ctx);
|
||||
return -EFAULT;
|
||||
}
|
||||
if (slot.resv[0] | slot.resv[1] | slot.resv[2]) {
|
||||
io_notif_unregister(ctx);
|
||||
return -EINVAL;
|
||||
}
|
||||
notif_slot->tag = slot.tag;
|
||||
}
|
||||
return 0;
|
||||
}
|
|
@@ -0,0 +1,90 @@
// SPDX-License-Identifier: GPL-2.0

#include <linux/net.h>
#include <linux/uio.h>
#include <net/sock.h>
#include <linux/nospec.h>

#include "rsrc.h"

#define IO_NOTIF_SPLICE_BATCH	32
#define IORING_MAX_NOTIF_SLOTS	(1U << 10)

struct io_notif_data {
	struct file *file;
	struct ubuf_info uarg;
	unsigned long account_pages;
};

struct io_notif_slot {
	/*
	 * Current/active notifier. A slot holds only one active notifier at a
	 * time and keeps one reference to it. Flush releases the reference and
	 * lazily replaces it with a new notifier.
	 */
	struct io_kiocb *notif;

	/*
	 * Default ->user_data for this slot's notifiers' CQEs
	 */
	u64 tag;
	/*
	 * Notifiers of a slot live in generations, we create a new notifier
	 * only after flushing the previous one. Track the sequential number
	 * for all notifiers and copy it into the notifiers' cqe->cflags
	 */
	u32 seq;
};

int io_notif_register(struct io_ring_ctx *ctx,
		      void __user *arg, unsigned int size);
int io_notif_unregister(struct io_ring_ctx *ctx);

void io_notif_slot_flush(struct io_notif_slot *slot);
struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx,
				struct io_notif_slot *slot);

static inline struct io_notif_data *io_notif_to_data(struct io_kiocb *notif)
{
	return io_kiocb_to_cmd(notif);
}

static inline struct io_kiocb *io_get_notif(struct io_ring_ctx *ctx,
					    struct io_notif_slot *slot)
{
	if (!slot->notif)
		slot->notif = io_alloc_notif(ctx, slot);
	return slot->notif;
}

static inline struct io_notif_slot *io_get_notif_slot(struct io_ring_ctx *ctx,
						       unsigned idx)
	__must_hold(&ctx->uring_lock)
{
	if (idx >= ctx->nr_notif_slots)
		return NULL;
	idx = array_index_nospec(idx, ctx->nr_notif_slots);
	return &ctx->notif_slots[idx];
}

static inline void io_notif_slot_flush_submit(struct io_notif_slot *slot,
					      unsigned int issue_flags)
{
	io_notif_slot_flush(slot);
}

static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len)
{
	struct io_ring_ctx *ctx = notif->ctx;
	struct io_notif_data *nd = io_notif_to_data(notif);
	unsigned nr_pages = (len >> PAGE_SHIFT) + 2;
	int ret;

	if (ctx->user) {
		ret = __io_account_mem(ctx->user, nr_pages);
		if (ret)
			return ret;
		nd->account_pages += nr_pages;
	}
	return 0;
}

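To make the slot model above concrete, here is a small hypothetical userspace fragment showing how notification CQEs are told apart from ordinary send completions: the flushed notifier posts a separate CQE whose user_data is the slot's tag and whose flags carry the slot's generation sequence. The printf bodies stand in for real handling and are illustrative only.

#include <stdio.h>
#include <linux/io_uring.h>

/* Sketch: distinguish send completions from notification completions. */
static void reap_cqe(const struct io_uring_cqe *cqe, unsigned long long notif_tag)
{
	if (cqe->user_data == notif_tag)
		/* notification CQE: buffers of generation cqe->flags may be reused */
		printf("generation %u buffers released\n", cqe->flags);
	else
		/* send CQE: res is bytes sent or a negative errno */
		printf("send %llu completed, res=%d\n",
		       (unsigned long long)cqe->user_data, cqe->res);
}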
@ -246,12 +246,13 @@ const struct io_op_def io_op_defs[] = {
|
|||
.prep = io_close_prep,
|
||||
.issue = io_close,
|
||||
},
|
||||
[IORING_OP_FILES_UPDATE] = {
|
||||
[IORING_OP_RSRC_UPDATE] = {
|
||||
.audit_skip = 1,
|
||||
.iopoll = 1,
|
||||
.name = "FILES_UPDATE",
|
||||
.prep = io_files_update_prep,
|
||||
.issue = io_files_update,
|
||||
.name = "RSRC_UPDATE",
|
||||
.prep = io_rsrc_update_prep,
|
||||
.issue = io_rsrc_update,
|
||||
.ioprio = 1,
|
||||
},
|
||||
[IORING_OP_STATX] = {
|
||||
.audit_skip = 1,
|
||||
|
@ -470,6 +471,21 @@ const struct io_op_def io_op_defs[] = {
|
|||
.issue = io_uring_cmd,
|
||||
.prep_async = io_uring_cmd_prep_async,
|
||||
},
|
||||
[IORING_OP_SENDZC_NOTIF] = {
|
||||
.name = "SENDZC_NOTIF",
|
||||
.needs_file = 1,
|
||||
.unbound_nonreg_file = 1,
|
||||
.pollout = 1,
|
||||
.audit_skip = 1,
|
||||
.ioprio = 1,
|
||||
#if defined(CONFIG_NET)
|
||||
.prep = io_sendzc_prep,
|
||||
.issue = io_sendzc,
|
||||
#else
|
||||
.prep = io_eopnotsupp_prep,
|
||||
#endif
|
||||
|
||||
},
|
||||
};
|
||||
|
||||
const char *io_uring_get_opcode(u8 opcode)
|
||||
|
|
|
@ -15,12 +15,14 @@
|
|||
#include "io_uring.h"
|
||||
#include "openclose.h"
|
||||
#include "rsrc.h"
|
||||
#include "notif.h"
|
||||
|
||||
struct io_rsrc_update {
|
||||
struct file *file;
|
||||
u64 arg;
|
||||
u32 nr_args;
|
||||
u32 offset;
|
||||
int type;
|
||||
};
|
||||
|
||||
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
|
||||
|
@ -42,17 +44,13 @@ void io_rsrc_refs_drop(struct io_ring_ctx *ctx)
|
|||
}
|
||||
}
|
||||
|
||||
static inline void __io_unaccount_mem(struct user_struct *user,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
atomic_long_sub(nr_pages, &user->locked_vm);
|
||||
}
|
||||
|
||||
static inline int __io_account_mem(struct user_struct *user,
|
||||
unsigned long nr_pages)
|
||||
int __io_account_mem(struct user_struct *user, unsigned long nr_pages)
|
||||
{
|
||||
unsigned long page_limit, cur_pages, new_pages;
|
||||
|
||||
if (!nr_pages)
|
||||
return 0;
|
||||
|
||||
/* Don't allow more pages than we can safely lock */
|
||||
page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
|
||||
|
||||
|
@ -657,7 +655,7 @@ __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
|
|||
return -EINVAL;
|
||||
}
|
||||
|
||||
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
||||
{
|
||||
struct io_rsrc_update *up = io_kiocb_to_cmd(req);
|
||||
|
||||
|
@ -671,6 +669,7 @@ int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
|
|||
if (!up->nr_args)
|
||||
return -EINVAL;
|
||||
up->arg = READ_ONCE(sqe->addr);
|
||||
up->type = READ_ONCE(sqe->ioprio);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -713,7 +712,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
|
|||
return ret;
|
||||
}
|
||||
|
||||
int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
|
||||
static int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_rsrc_update *up = io_kiocb_to_cmd(req);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
|
@ -742,6 +741,54 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
|
|||
return IOU_OK;
|
||||
}
|
||||
|
||||
static int io_notif_update(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_rsrc_update *up = io_kiocb_to_cmd(req);
|
||||
struct io_ring_ctx *ctx = req->ctx;
|
||||
unsigned len = up->nr_args;
|
||||
unsigned idx_end, idx = up->offset;
|
||||
int ret = 0;
|
||||
|
||||
io_ring_submit_lock(ctx, issue_flags);
|
||||
if (unlikely(check_add_overflow(idx, len, &idx_end))) {
|
||||
ret = -EOVERFLOW;
|
||||
goto out;
|
||||
}
|
||||
if (unlikely(idx_end > ctx->nr_notif_slots)) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
for (; idx < idx_end; idx++) {
|
||||
struct io_notif_slot *slot = &ctx->notif_slots[idx];
|
||||
|
||||
if (!slot->notif)
|
||||
continue;
|
||||
if (up->arg)
|
||||
slot->tag = up->arg;
|
||||
io_notif_slot_flush_submit(slot, issue_flags);
|
||||
}
|
||||
out:
|
||||
io_ring_submit_unlock(ctx, issue_flags);
|
||||
if (ret < 0)
|
||||
req_set_fail(req);
|
||||
io_req_set_res(req, ret, 0);
|
||||
return IOU_OK;
|
||||
}
|
||||
|
||||
int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags)
|
||||
{
|
||||
struct io_rsrc_update *up = io_kiocb_to_cmd(req);
|
||||
|
||||
switch (up->type) {
|
||||
case IORING_RSRC_UPDATE_FILES:
|
||||
return io_files_update(req, issue_flags);
|
||||
case IORING_RSRC_UPDATE_NOTIF:
|
||||
return io_notif_update(req, issue_flags);
|
||||
}
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
|
||||
struct io_rsrc_node *node, void *rsrc)
|
||||
{
|
||||
|
|
|
@ -135,6 +135,13 @@ static inline void io_req_put_rsrc_locked(struct io_kiocb *req,
|
|||
}
|
||||
}
|
||||
|
||||
static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx)
|
||||
{
|
||||
ctx->rsrc_cached_refs--;
|
||||
if (unlikely(ctx->rsrc_cached_refs < 0))
|
||||
io_rsrc_refs_refill(ctx);
|
||||
}
|
||||
|
||||
static inline void io_req_set_rsrc_node(struct io_kiocb *req,
|
||||
struct io_ring_ctx *ctx,
|
||||
unsigned int issue_flags)
|
||||
|
@ -144,9 +151,8 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req,
|
|||
|
||||
if (!(issue_flags & IO_URING_F_UNLOCKED)) {
|
||||
lockdep_assert_held(&ctx->uring_lock);
|
||||
ctx->rsrc_cached_refs--;
|
||||
if (unlikely(ctx->rsrc_cached_refs < 0))
|
||||
io_rsrc_refs_refill(ctx);
|
||||
|
||||
io_charge_rsrc_node(ctx);
|
||||
} else {
|
||||
percpu_ref_get(&req->rsrc_node->refs);
|
||||
}
|
||||
|
@ -161,6 +167,15 @@ static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
|
|||
return &data->tags[table_idx][off];
|
||||
}
|
||||
|
||||
int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
|
||||
int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
int io_rsrc_update(struct io_kiocb *req, unsigned int issue_flags);
|
||||
int io_rsrc_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe);
|
||||
|
||||
int __io_account_mem(struct user_struct *user, unsigned long nr_pages);
|
||||
|
||||
static inline void __io_unaccount_mem(struct user_struct *user,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
atomic_long_sub(nr_pages, &user->locked_vm);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,31 +1,5 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include <linux/llist.h>
|
||||
|
||||
/*
|
||||
* Arbitrary limit, can be raised if need be
|
||||
*/
|
||||
#define IO_RINGFD_REG_MAX 16
|
||||
|
||||
struct io_uring_task {
|
||||
/* submission side */
|
||||
int cached_refs;
|
||||
const struct io_ring_ctx *last;
|
||||
struct io_wq *io_wq;
|
||||
struct file *registered_rings[IO_RINGFD_REG_MAX];
|
||||
|
||||
struct xarray xa;
|
||||
struct wait_queue_head wait;
|
||||
atomic_t in_idle;
|
||||
atomic_t inflight_tracked;
|
||||
struct percpu_counter inflight;
|
||||
|
||||
struct { /* task_work */
|
||||
struct llist_head task_list;
|
||||
struct callback_head task_work;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
};
|
||||
|
||||
struct io_tctx_node {
|
||||
struct list_head ctx_node;
|
||||
struct task_struct *task;
|
||||
|
|
|
@ -75,6 +75,7 @@ int __get_compat_msghdr(struct msghdr *kmsg,
|
|||
return -EMSGSIZE;
|
||||
|
||||
kmsg->msg_iocb = NULL;
|
||||
kmsg->msg_ubuf = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -613,10 +613,16 @@ fault:
|
|||
}
|
||||
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
|
||||
|
||||
int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
|
||||
struct iov_iter *from, size_t length)
|
||||
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
|
||||
struct sk_buff *skb, struct iov_iter *from,
|
||||
size_t length)
|
||||
{
|
||||
int frag = skb_shinfo(skb)->nr_frags;
|
||||
int frag;
|
||||
|
||||
if (msg && msg->msg_ubuf && msg->sg_from_iter)
|
||||
return msg->sg_from_iter(sk, skb, from, length);
|
||||
|
||||
frag = skb_shinfo(skb)->nr_frags;
|
||||
|
||||
while (length && iov_iter_count(from)) {
|
||||
struct page *pages[MAX_SKB_FRAGS];
|
||||
|
@ -702,7 +708,7 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
|
|||
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
|
||||
return -EFAULT;
|
||||
|
||||
return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
|
||||
return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
|
||||
}
|
||||
EXPORT_SYMBOL(zerocopy_sg_from_iter);
|
||||
|
||||
|
|
|
@ -666,11 +666,18 @@ static void skb_release_data(struct sk_buff *skb)
|
|||
&shinfo->dataref))
|
||||
goto exit;
|
||||
|
||||
skb_zcopy_clear(skb, true);
|
||||
if (skb_zcopy(skb)) {
|
||||
bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
|
||||
|
||||
skb_zcopy_clear(skb, true);
|
||||
if (skip_unref)
|
||||
goto free_head;
|
||||
}
|
||||
|
||||
for (i = 0; i < shinfo->nr_frags; i++)
|
||||
__skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
|
||||
|
||||
free_head:
|
||||
if (shinfo->frag_list)
|
||||
kfree_skb_list(shinfo->frag_list);
|
||||
|
||||
|
@ -895,7 +902,10 @@ EXPORT_SYMBOL(skb_dump);
|
|||
*/
|
||||
void skb_tx_error(struct sk_buff *skb)
|
||||
{
|
||||
skb_zcopy_clear(skb, true);
|
||||
if (skb) {
|
||||
skb_zcopy_downgrade_managed(skb);
|
||||
skb_zcopy_clear(skb, true);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(skb_tx_error);
|
||||
|
||||
|
@ -1193,7 +1203,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
|
|||
uarg->len = 1;
|
||||
uarg->bytelen = size;
|
||||
uarg->zerocopy = 1;
|
||||
uarg->flags = SKBFL_ZEROCOPY_FRAG;
|
||||
uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
|
||||
refcount_set(&uarg->refcnt, 1);
|
||||
sock_hold(sk);
|
||||
|
||||
|
@ -1212,6 +1222,10 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
|
|||
const u32 byte_limit = 1 << 19; /* limit to a few TSO */
|
||||
u32 bytelen, next;
|
||||
|
||||
/* there might be non MSG_ZEROCOPY users */
|
||||
if (uarg->callback != msg_zerocopy_callback)
|
||||
return NULL;
|
||||
|
||||
/* realloc only when socket is locked (TCP, UDP cork),
|
||||
* so uarg->len and sk_zckey access is serialized
|
||||
*/
|
||||
|
@ -1354,7 +1368,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
|
|||
if (orig_uarg && uarg != orig_uarg)
|
||||
return -EEXIST;
|
||||
|
||||
err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
|
||||
err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
|
||||
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
|
||||
struct sock *save_sk = skb->sk;
|
||||
|
||||
|
@ -1371,6 +1385,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
|
|||
}
|
||||
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
|
||||
|
||||
void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
|
||||
{
|
||||
int i;
|
||||
|
||||
skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
|
||||
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
|
||||
skb_frag_ref(skb, i);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);
|
||||
|
||||
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
|
||||
gfp_t gfp_mask)
|
||||
{
|
||||
|
@ -1688,6 +1712,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
|
|||
|
||||
BUG_ON(skb_shared(skb));
|
||||
|
||||
skb_zcopy_downgrade_managed(skb);
|
||||
|
||||
size = SKB_DATA_ALIGN(size);
|
||||
|
||||
if (skb_pfmemalloc(skb))
|
||||
|
@ -3484,6 +3510,8 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
|
|||
int pos = skb_headlen(skb);
|
||||
const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
|
||||
|
||||
skb_zcopy_downgrade_managed(skb);
|
||||
|
||||
skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
|
||||
skb_zerocopy_clone(skb1, skb, 0);
|
||||
if (len < pos) /* Split line is inside header. */
|
||||
|
@ -3837,6 +3865,7 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
|
|||
if (skb_can_coalesce(skb, i, page, offset)) {
|
||||
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
|
||||
} else if (i < MAX_SKB_FRAGS) {
|
||||
skb_zcopy_downgrade_managed(skb);
|
||||
get_page(page);
|
||||
skb_fill_page_desc(skb, i, page, offset, size);
|
||||
} else {
|
||||
|
|
|
@ -969,7 +969,6 @@ static int __ip_append_data(struct sock *sk,
|
|||
struct inet_sock *inet = inet_sk(sk);
|
||||
struct ubuf_info *uarg = NULL;
|
||||
struct sk_buff *skb;
|
||||
|
||||
struct ip_options *opt = cork->opt;
|
||||
int hh_len;
|
||||
int exthdrlen;
|
||||
|
@ -977,6 +976,7 @@ static int __ip_append_data(struct sock *sk,
|
|||
int copy;
|
||||
int err;
|
||||
int offset = 0;
|
||||
bool zc = false;
|
||||
unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
|
||||
int csummode = CHECKSUM_NONE;
|
||||
struct rtable *rt = (struct rtable *)cork->dst;
|
||||
|
@ -1017,17 +1017,35 @@ static int __ip_append_data(struct sock *sk,
|
|||
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
|
||||
csummode = CHECKSUM_PARTIAL;
|
||||
|
||||
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
|
||||
uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
|
||||
if (!uarg)
|
||||
return -ENOBUFS;
|
||||
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
|
||||
if (rt->dst.dev->features & NETIF_F_SG &&
|
||||
csummode == CHECKSUM_PARTIAL) {
|
||||
paged = true;
|
||||
} else {
|
||||
uarg->zerocopy = 0;
|
||||
skb_zcopy_set(skb, uarg, &extra_uref);
|
||||
if ((flags & MSG_ZEROCOPY) && length) {
|
||||
struct msghdr *msg = from;
|
||||
|
||||
if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
|
||||
if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
|
||||
return -EINVAL;
|
||||
|
||||
/* Leave uarg NULL if can't zerocopy, callers should
|
||||
* be able to handle it.
|
||||
*/
|
||||
if ((rt->dst.dev->features & NETIF_F_SG) &&
|
||||
csummode == CHECKSUM_PARTIAL) {
|
||||
paged = true;
|
||||
zc = true;
|
||||
uarg = msg->msg_ubuf;
|
||||
}
|
||||
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
|
||||
uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
|
||||
if (!uarg)
|
||||
return -ENOBUFS;
|
||||
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
|
||||
if (rt->dst.dev->features & NETIF_F_SG &&
|
||||
csummode == CHECKSUM_PARTIAL) {
|
||||
paged = true;
|
||||
zc = true;
|
||||
} else {
|
||||
uarg->zerocopy = 0;
|
||||
skb_zcopy_set(skb, uarg, &extra_uref);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1091,9 +1109,12 @@ alloc_new_skb:
|
|||
(fraglen + alloc_extra < SKB_MAX_ALLOC ||
|
||||
!(rt->dst.dev->features & NETIF_F_SG)))
|
||||
alloclen = fraglen;
|
||||
else {
|
||||
else if (!zc) {
|
||||
alloclen = min_t(int, fraglen, MAX_HEADER);
|
||||
pagedlen = fraglen - alloclen;
|
||||
} else {
|
||||
alloclen = fragheaderlen + transhdrlen;
|
||||
pagedlen = datalen - transhdrlen;
|
||||
}
|
||||
|
||||
alloclen += alloc_extra;
|
||||
|
@ -1188,13 +1209,14 @@ alloc_new_skb:
|
|||
err = -EFAULT;
|
||||
goto error;
|
||||
}
|
||||
} else if (!uarg || !uarg->zerocopy) {
|
||||
} else if (!zc) {
|
||||
int i = skb_shinfo(skb)->nr_frags;
|
||||
|
||||
err = -ENOMEM;
|
||||
if (!sk_page_frag_refill(sk, pfrag))
|
||||
goto error;
|
||||
|
||||
skb_zcopy_downgrade_managed(skb);
|
||||
if (!skb_can_coalesce(skb, i, pfrag->page,
|
||||
pfrag->offset)) {
|
||||
err = -EMSGSIZE;
|
||||
|
|
|
@ -1203,17 +1203,23 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
|
|||
|
||||
flags = msg->msg_flags;
|
||||
|
||||
if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
|
||||
if ((flags & MSG_ZEROCOPY) && size) {
|
||||
skb = tcp_write_queue_tail(sk);
|
||||
uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
|
||||
if (!uarg) {
|
||||
err = -ENOBUFS;
|
||||
goto out_err;
|
||||
}
|
||||
|
||||
zc = sk->sk_route_caps & NETIF_F_SG;
|
||||
if (!zc)
|
||||
uarg->zerocopy = 0;
|
||||
if (msg->msg_ubuf) {
|
||||
uarg = msg->msg_ubuf;
|
||||
net_zcopy_get(uarg);
|
||||
zc = sk->sk_route_caps & NETIF_F_SG;
|
||||
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
|
||||
uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
|
||||
if (!uarg) {
|
||||
err = -ENOBUFS;
|
||||
goto out_err;
|
||||
}
|
||||
zc = sk->sk_route_caps & NETIF_F_SG;
|
||||
if (!zc)
|
||||
uarg->zerocopy = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
|
||||
|
@ -1336,8 +1342,13 @@ new_segment:
|
|||
|
||||
copy = min_t(int, copy, pfrag->size - pfrag->offset);
|
||||
|
||||
if (tcp_downgrade_zcopy_pure(sk, skb) ||
|
||||
!sk_wmem_schedule(sk, copy))
|
||||
if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) {
|
||||
if (tcp_downgrade_zcopy_pure(sk, skb))
|
||||
goto wait_for_space;
|
||||
skb_zcopy_downgrade_managed(skb);
|
||||
}
|
||||
|
||||
if (!sk_wmem_schedule(sk, copy))
|
||||
goto wait_for_space;
|
||||
|
||||
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
|
||||
|
|
|
@ -1464,6 +1464,7 @@ static int __ip6_append_data(struct sock *sk,
|
|||
int copy;
|
||||
int err;
|
||||
int offset = 0;
|
||||
bool zc = false;
|
||||
u32 tskey = 0;
|
||||
struct rt6_info *rt = (struct rt6_info *)cork->dst;
|
||||
struct ipv6_txoptions *opt = v6_cork->opt;
|
||||
|
@ -1541,17 +1542,35 @@ emsgsize:
|
|||
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
|
||||
csummode = CHECKSUM_PARTIAL;
|
||||
|
||||
if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
|
||||
uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
|
||||
if (!uarg)
|
||||
return -ENOBUFS;
|
||||
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
|
||||
if (rt->dst.dev->features & NETIF_F_SG &&
|
||||
csummode == CHECKSUM_PARTIAL) {
|
||||
paged = true;
|
||||
} else {
|
||||
uarg->zerocopy = 0;
|
||||
skb_zcopy_set(skb, uarg, &extra_uref);
|
||||
if ((flags & MSG_ZEROCOPY) && length) {
|
||||
struct msghdr *msg = from;
|
||||
|
||||
if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
|
||||
if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
|
||||
return -EINVAL;
|
||||
|
||||
/* Leave uarg NULL if can't zerocopy, callers should
|
||||
* be able to handle it.
|
||||
*/
|
||||
if ((rt->dst.dev->features & NETIF_F_SG) &&
|
||||
csummode == CHECKSUM_PARTIAL) {
|
||||
paged = true;
|
||||
zc = true;
|
||||
uarg = msg->msg_ubuf;
|
||||
}
|
||||
} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
|
||||
uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
|
||||
if (!uarg)
|
||||
return -ENOBUFS;
|
||||
extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
|
||||
if (rt->dst.dev->features & NETIF_F_SG &&
|
||||
csummode == CHECKSUM_PARTIAL) {
|
||||
paged = true;
|
||||
zc = true;
|
||||
} else {
|
||||
uarg->zerocopy = 0;
|
||||
skb_zcopy_set(skb, uarg, &extra_uref);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1630,9 +1649,12 @@ alloc_new_skb:
|
|||
(fraglen + alloc_extra < SKB_MAX_ALLOC ||
|
||||
!(rt->dst.dev->features & NETIF_F_SG)))
|
||||
alloclen = fraglen;
|
||||
else {
|
||||
else if (!zc) {
|
||||
alloclen = min_t(int, fraglen, MAX_HEADER);
|
||||
pagedlen = fraglen - alloclen;
|
||||
} else {
|
||||
alloclen = fragheaderlen + transhdrlen;
|
||||
pagedlen = datalen - transhdrlen;
|
||||
}
|
||||
alloclen += alloc_extra;
|
||||
|
||||
|
@ -1742,13 +1764,14 @@ alloc_new_skb:
|
|||
err = -EFAULT;
|
||||
goto error;
|
||||
}
|
||||
} else if (!uarg || !uarg->zerocopy) {
|
||||
} else if (!zc) {
|
||||
int i = skb_shinfo(skb)->nr_frags;
|
||||
|
||||
err = -ENOMEM;
|
||||
if (!sk_page_frag_refill(sk, pfrag))
|
||||
goto error;
|
||||
|
||||
skb_zcopy_downgrade_managed(skb);
|
||||
if (!skb_can_coalesce(skb, i, pfrag->page,
|
||||
pfrag->offset)) {
|
||||
err = -EMSGSIZE;
|
||||
|
|
|
@ -2106,6 +2106,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
|
|||
msg.msg_control = NULL;
|
||||
msg.msg_controllen = 0;
|
||||
msg.msg_namelen = 0;
|
||||
msg.msg_ubuf = NULL;
|
||||
if (addr) {
|
||||
err = move_addr_to_kernel(addr, addr_len, &address);
|
||||
if (err < 0)
|
||||
|
@ -2400,6 +2401,7 @@ int __copy_msghdr(struct msghdr *kmsg,
|
|||
return -EMSGSIZE;
|
||||
|
||||
kmsg->msg_iocb = NULL;
|
||||
kmsg->msg_ubuf = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -59,6 +59,7 @@ TEST_GEN_FILES += toeplitz
|
|||
TEST_GEN_FILES += cmsg_sender
|
||||
TEST_GEN_FILES += stress_reuseport_listen
|
||||
TEST_PROGS += test_vxlan_vnifiltering.sh
|
||||
TEST_GEN_FILES += io_uring_zerocopy_tx
|
||||
|
||||
TEST_FILES := settings
|
||||
|
||||
|
|
|
@ -0,0 +1,605 @@
/* SPDX-License-Identifier: MIT */
/* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */
#include <assert.h>
#include <errno.h>
#include <error.h>
#include <fcntl.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <arpa/inet.h>
#include <linux/errqueue.h>
#include <linux/if_packet.h>
#include <linux/io_uring.h>
#include <linux/ipv6.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/tcp.h>
#include <netinet/udp.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/wait.h>

#define NOTIF_TAG 0xfffffffULL
#define NONZC_TAG 0
#define ZC_TAG 1

enum {
	MODE_NONZC = 0,
	MODE_ZC = 1,
	MODE_ZC_FIXED = 2,
	MODE_MIXED = 3,
};

static bool cfg_flush = false;
static bool cfg_cork = false;
static int cfg_mode = MODE_ZC_FIXED;
static int cfg_nr_reqs = 8;
static int cfg_family = PF_UNSPEC;
static int cfg_payload_len;
static int cfg_port = 8000;
static int cfg_runtime_ms = 4200;

static socklen_t cfg_alen;
static struct sockaddr_storage cfg_dst_addr;

static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));

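/*
 * Minimal raw io_uring interface used here instead of liburing: the structs
 * below mirror the SQ/CQ ring memory that io_uring_mmap() maps in further down.
 */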
struct io_sq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	unsigned *flags;
	unsigned *array;
};

struct io_cq_ring {
	unsigned *head;
	unsigned *tail;
	unsigned *ring_mask;
	unsigned *ring_entries;
	struct io_uring_cqe *cqes;
};

struct io_uring_sq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *kflags;
	unsigned *kdropped;
	unsigned *array;
	struct io_uring_sqe *sqes;

	unsigned sqe_head;
	unsigned sqe_tail;

	size_t ring_sz;
};

struct io_uring_cq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *koverflow;
	struct io_uring_cqe *cqes;

	size_t ring_sz;
};

struct io_uring {
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	int ring_fd;
};

#ifdef __alpha__
# ifndef __NR_io_uring_setup
#  define __NR_io_uring_setup 535
# endif
# ifndef __NR_io_uring_enter
#  define __NR_io_uring_enter 536
# endif
# ifndef __NR_io_uring_register
#  define __NR_io_uring_register 537
# endif
#else /* !__alpha__ */
# ifndef __NR_io_uring_setup
#  define __NR_io_uring_setup 425
# endif
# ifndef __NR_io_uring_enter
#  define __NR_io_uring_enter 426
# endif
# ifndef __NR_io_uring_register
#  define __NR_io_uring_register 427
# endif
#endif

#if defined(__x86_64) || defined(__i386__)
#define read_barrier()	__asm__ __volatile__("":::"memory")
#define write_barrier()	__asm__ __volatile__("":::"memory")
#else

#define read_barrier()	__sync_synchronize()
#define write_barrier()	__sync_synchronize()
#endif

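/*
 * Thin wrappers around the raw io_uring syscalls; the syscall numbers are
 * defined above in case the libc headers do not provide them.
 */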
static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
{
	return syscall(__NR_io_uring_setup, entries, p);
}

static int io_uring_enter(int fd, unsigned int to_submit,
			  unsigned int min_complete,
			  unsigned int flags, sigset_t *sig)
{
	return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
		       flags, sig, _NSIG / 8);
}

static int io_uring_register_buffers(struct io_uring *ring,
				     const struct iovec *iovecs,
				     unsigned nr_iovecs)
{
	int ret;

	ret = syscall(__NR_io_uring_register, ring->ring_fd,
		      IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
	return (ret < 0) ? -errno : ret;
}

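/*
 * Register an array of notification slots with the ring; zerocopy sends
 * later reference a slot by index through sqe->notification_idx below.
 */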
static int io_uring_register_notifications(struct io_uring *ring,
					   unsigned nr,
					   struct io_uring_notification_slot *slots)
{
	int ret;
	struct io_uring_notification_register r = {
		.nr_slots = nr,
		.data = (unsigned long)slots,
	};

	ret = syscall(__NR_io_uring_register, ring->ring_fd,
		      IORING_REGISTER_NOTIFIERS, &r, sizeof(r));
	return (ret < 0) ? -errno : ret;
}

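/*
 * Map the SQ ring, the SQE array and the CQ ring into the process via the
 * three fixed mmap offsets, then resolve the ring pointers from the offsets
 * returned in struct io_uring_params.
 */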
static int io_uring_mmap(int fd, struct io_uring_params *p,
			 struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	size_t size;
	void *ptr;
	int ret;

	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	if (ptr == MAP_FAILED)
		return -errno;
	sq->khead = ptr + p->sq_off.head;
	sq->ktail = ptr + p->sq_off.tail;
	sq->kring_mask = ptr + p->sq_off.ring_mask;
	sq->kring_entries = ptr + p->sq_off.ring_entries;
	sq->kflags = ptr + p->sq_off.flags;
	sq->kdropped = ptr + p->sq_off.dropped;
	sq->array = ptr + p->sq_off.array;

	size = p->sq_entries * sizeof(struct io_uring_sqe);
	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
	if (sq->sqes == MAP_FAILED) {
		ret = -errno;
err:
		munmap(sq->khead, sq->ring_sz);
		return ret;
	}

	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
	if (ptr == MAP_FAILED) {
		ret = -errno;
		munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
		goto err;
	}
	cq->khead = ptr + p->cq_off.head;
	cq->ktail = ptr + p->cq_off.tail;
	cq->kring_mask = ptr + p->cq_off.ring_mask;
	cq->kring_entries = ptr + p->cq_off.ring_entries;
	cq->koverflow = ptr + p->cq_off.overflow;
	cq->cqes = ptr + p->cq_off.cqes;
	return 0;
}

static int io_uring_queue_init(unsigned entries, struct io_uring *ring,
			       unsigned flags)
{
	struct io_uring_params p;
	int fd, ret;

	memset(ring, 0, sizeof(*ring));
	memset(&p, 0, sizeof(p));
	p.flags = flags;

	fd = io_uring_setup(entries, &p);
	if (fd < 0)
		return fd;
	ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
	if (!ret)
		ring->ring_fd = fd;
	else
		close(fd);
	return ret;
}

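/*
 * Publish locally prepared SQEs: copy the sqe_head..sqe_tail range into the
 * kernel-visible SQ array, advance the ring tail under write barriers, then
 * tell the kernel about them with io_uring_enter().
 */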
static int io_uring_submit(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;
	const unsigned mask = *sq->kring_mask;
	unsigned ktail, submitted, to_submit;
	int ret;

	read_barrier();
	if (*sq->khead != *sq->ktail) {
		submitted = *sq->kring_entries;
		goto submit;
	}
	if (sq->sqe_head == sq->sqe_tail)
		return 0;

	ktail = *sq->ktail;
	to_submit = sq->sqe_tail - sq->sqe_head;
	for (submitted = 0; submitted < to_submit; submitted++) {
		read_barrier();
		sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
	}
	if (!submitted)
		return 0;

	if (*sq->ktail != ktail) {
		write_barrier();
		*sq->ktail = ktail;
		write_barrier();
	}
submit:
	ret = io_uring_enter(ring->ring_fd, submitted, 0,
			     IORING_ENTER_GETEVENTS, NULL);
	return ret < 0 ? -errno : ret;
}

static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
				      const void *buf, size_t len, int flags)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = (__u8) IORING_OP_SEND;
	sqe->fd = sockfd;
	sqe->addr = (unsigned long) buf;
	sqe->len = len;
	sqe->msg_flags = (__u32) flags;
}

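/*
 * A zerocopy send is a regular send SQE with the IORING_OP_SENDZC_NOTIF
 * opcode, the target notification slot in notification_idx, and zerocopy
 * flags (e.g. IORING_RECVSEND_NOTIF_FLUSH) carried in the ioprio field.
 */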
static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
					const void *buf, size_t len, int flags,
					unsigned slot_idx, unsigned zc_flags)
{
	io_uring_prep_send(sqe, sockfd, buf, len, flags);
	sqe->opcode = (__u8) IORING_OP_SENDZC_NOTIF;
	sqe->notification_idx = slot_idx;
	sqe->ioprio = zc_flags;
}

static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
{
	struct io_uring_sq *sq = &ring->sq;

	if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries)
		return NULL;
	return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
}

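/*
 * Wait for one CQE: spin on the CQ head/tail indices and fall back to
 * io_uring_enter(IORING_ENTER_GETEVENTS) while the ring is empty.
 */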
static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
{
	struct io_uring_cq *cq = &ring->cq;
	const unsigned mask = *cq->kring_mask;
	unsigned head = *cq->khead;
	int ret;

	*cqe_ptr = NULL;
	do {
		read_barrier();
		if (head != *cq->ktail) {
			*cqe_ptr = &cq->cqes[head & mask];
			break;
		}
		ret = io_uring_enter(ring->ring_fd, 0, 1,
				     IORING_ENTER_GETEVENTS, NULL);
		if (ret < 0)
			return -errno;
	} while (1);

	return 0;
}

static inline void io_uring_cqe_seen(struct io_uring *ring)
{
	*(&ring->cq)->khead += 1;
	write_barrier();
}

static unsigned long gettimeofday_ms(void)
{
	struct timeval tv;

	gettimeofday(&tv, NULL);
	return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
}

static void do_setsockopt(int fd, int level, int optname, int val)
{
	if (setsockopt(fd, level, optname, &val, sizeof(val)))
		error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
}

static int do_setup_tx(int domain, int type, int protocol)
{
	int fd;

	fd = socket(domain, type, protocol);
	if (fd == -1)
		error(1, errno, "socket t");

	do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);

	if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
		error(1, errno, "connect");
	return fd;
}

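/*
 * Main send loop: register one notification slot and the payload as a fixed
 * buffer, then repeatedly queue cfg_nr_reqs sends and reap their CQEs. With
 * -f each zerocopy send also flushes the slot, so an extra notification CQE
 * (NOTIF_TAG) is expected and accounted for in compl_cqes.
 */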
static void do_tx(int domain, int type, int protocol)
{
	struct io_uring_notification_slot b[1] = {{.tag = NOTIF_TAG}};
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	unsigned long packets = 0, bytes = 0;
	struct io_uring ring;
	struct iovec iov;
	uint64_t tstop;
	int i, fd, ret;
	int compl_cqes = 0;

	fd = do_setup_tx(domain, type, protocol);

	ret = io_uring_queue_init(512, &ring, 0);
	if (ret)
		error(1, ret, "io_uring: queue init");

	ret = io_uring_register_notifications(&ring, 1, b);
	if (ret)
		error(1, ret, "io_uring: tx ctx registration");

	iov.iov_base = payload;
	iov.iov_len = cfg_payload_len;

	ret = io_uring_register_buffers(&ring, &iov, 1);
	if (ret)
		error(1, ret, "io_uring: buffer registration");

	tstop = gettimeofday_ms() + cfg_runtime_ms;
	do {
		if (cfg_cork)
			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);

		for (i = 0; i < cfg_nr_reqs; i++) {
			unsigned zc_flags = 0;
			unsigned buf_idx = 0;
			unsigned slot_idx = 0;
			unsigned mode = cfg_mode;
			unsigned msg_flags = 0;

			if (cfg_mode == MODE_MIXED)
				mode = rand() % 3;

			sqe = io_uring_get_sqe(&ring);

			if (mode == MODE_NONZC) {
				io_uring_prep_send(sqe, fd, payload,
						   cfg_payload_len, msg_flags);
				sqe->user_data = NONZC_TAG;
			} else {
				if (cfg_flush) {
					zc_flags |= IORING_RECVSEND_NOTIF_FLUSH;
					compl_cqes++;
				}
				io_uring_prep_sendzc(sqe, fd, payload,
						     cfg_payload_len,
						     msg_flags, slot_idx, zc_flags);
				if (mode == MODE_ZC_FIXED) {
					sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
					sqe->buf_index = buf_idx;
				}
				sqe->user_data = ZC_TAG;
			}
		}

		ret = io_uring_submit(&ring);
		if (ret != cfg_nr_reqs)
			error(1, ret, "submit");

		for (i = 0; i < cfg_nr_reqs; i++) {
			ret = io_uring_wait_cqe(&ring, &cqe);
			if (ret)
				error(1, ret, "wait cqe");

			if (cqe->user_data == NOTIF_TAG) {
				compl_cqes--;
				i--;
			} else if (cqe->user_data != NONZC_TAG &&
				   cqe->user_data != ZC_TAG) {
				error(1, cqe->res, "invalid user_data");
			} else if (cqe->res <= 0 && cqe->res != -EAGAIN) {
				error(1, cqe->res, "send failed");
			} else {
				if (cqe->res > 0) {
					packets++;
					bytes += cqe->res;
				}
				/* failed requests don't flush */
				if (cfg_flush &&
				    cqe->res <= 0 &&
				    cqe->user_data == ZC_TAG)
					compl_cqes--;
			}
			io_uring_cqe_seen(&ring);
		}
		if (cfg_cork)
			do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
	} while (gettimeofday_ms() < tstop);

	if (close(fd))
		error(1, errno, "close");

	fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
		packets, bytes >> 20,
		packets / (cfg_runtime_ms / 1000),
		(bytes >> 20) / (cfg_runtime_ms / 1000));

	while (compl_cqes) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret)
			error(1, ret, "wait cqe");
		io_uring_cqe_seen(&ring);
		compl_cqes--;
	}
}

static void do_test(int domain, int type, int protocol)
{
	int i;

	for (i = 0; i < IP_MAXPACKET; i++)
		payload[i] = 'a' + (i % 26);
	do_tx(domain, type, protocol);
}

static void usage(const char *filepath)
{
	error(1, 0, "Usage: %s [-f] [-n<N>] [-z0] [-s<payload size>] "
		    "(-4|-6) [-t<time s>] -D<dst_ip> udp", filepath);
}

static void parse_opts(int argc, char **argv)
{
	const int max_payload_len = sizeof(payload) -
				    sizeof(struct ipv6hdr) -
				    sizeof(struct tcphdr) -
				    40 /* max tcp options */;
	struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr;
	struct sockaddr_in *addr4 = (void *) &cfg_dst_addr;
	char *daddr = NULL;
	int c;

	if (argc <= 1)
		usage(argv[0]);
	cfg_payload_len = max_payload_len;

	while ((c = getopt(argc, argv, "46D:p:s:t:n:fc:m:")) != -1) {
		switch (c) {
		case '4':
			if (cfg_family != PF_UNSPEC)
				error(1, 0, "Pass one of -4 or -6");
			cfg_family = PF_INET;
			cfg_alen = sizeof(struct sockaddr_in);
			break;
		case '6':
			if (cfg_family != PF_UNSPEC)
				error(1, 0, "Pass one of -4 or -6");
			cfg_family = PF_INET6;
			cfg_alen = sizeof(struct sockaddr_in6);
			break;
		case 'D':
			daddr = optarg;
			break;
		case 'p':
			cfg_port = strtoul(optarg, NULL, 0);
			break;
		case 's':
			cfg_payload_len = strtoul(optarg, NULL, 0);
			break;
		case 't':
			cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
			break;
		case 'n':
			cfg_nr_reqs = strtoul(optarg, NULL, 0);
			break;
		case 'f':
			cfg_flush = 1;
			break;
		case 'c':
			cfg_cork = strtol(optarg, NULL, 0);
			break;
		case 'm':
			cfg_mode = strtol(optarg, NULL, 0);
			break;
		}
	}

	switch (cfg_family) {
	case PF_INET:
		memset(addr4, 0, sizeof(*addr4));
		addr4->sin_family = AF_INET;
		addr4->sin_port = htons(cfg_port);
		if (daddr &&
		    inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1)
			error(1, 0, "ipv4 parse error: %s", daddr);
		break;
	case PF_INET6:
		memset(addr6, 0, sizeof(*addr6));
		addr6->sin6_family = AF_INET6;
		addr6->sin6_port = htons(cfg_port);
		if (daddr &&
		    inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1)
			error(1, 0, "ipv6 parse error: %s", daddr);
		break;
	default:
		error(1, 0, "illegal domain");
	}

	if (cfg_payload_len > max_payload_len)
		error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
	if (cfg_mode == MODE_NONZC && cfg_flush)
		error(1, 0, "-f: only zerocopy modes support notifications");
	if (optind != argc - 1)
		usage(argv[0]);
}

int main(int argc, char **argv)
{
	const char *cfg_test = argv[argc - 1];

	parse_opts(argc, argv);

	if (!strcmp(cfg_test, "tcp"))
		do_test(cfg_family, SOCK_STREAM, 0);
	else if (!strcmp(cfg_test, "udp"))
		do_test(cfg_family, SOCK_DGRAM, 0);
	else
		error(1, 0, "unknown cfg_test %s", cfg_test);
	return 0;
}

@ -0,0 +1,131 @@
#!/bin/bash
#
# Send data between two processes across namespaces
# Run twice: once without and once with zerocopy
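#
# Example of a single manual run (mirrors what the automated
# no-argument mode below does):
#   $0 4 udp -m 1 -t 1 -n 32
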
set -e

readonly DEV="veth0"
readonly DEV_MTU=65535
readonly BIN_TX="./io_uring_zerocopy_tx"
readonly BIN_RX="./msg_zerocopy"

readonly RAND="$(mktemp -u XXXXXX)"
readonly NSPREFIX="ns-${RAND}"
readonly NS1="${NSPREFIX}1"
readonly NS2="${NSPREFIX}2"

readonly SADDR4='192.168.1.1'
readonly DADDR4='192.168.1.2'
readonly SADDR6='fd::1'
readonly DADDR6='fd::2'

readonly path_sysctl_mem="net.core.optmem_max"

# No arguments: automated test
if [[ "$#" -eq "0" ]]; then
	IPs=( "4" "6" )
	protocols=( "tcp" "udp" )

	for IP in "${IPs[@]}"; do
		for proto in "${protocols[@]}"; do
			for mode in $(seq 1 3); do
				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32
				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32 -f
				$0 "$IP" "$proto" -m "$mode" -t 1 -n 32 -c -f
			done
		done
	done

	echo "OK. All tests passed"
	exit 0
fi

# Argument parsing
if [[ "$#" -lt "2" ]]; then
	echo "Usage: $0 [4|6] [tcp|udp|raw|raw_hdrincl|packet|packet_dgram] <args>"
	exit 1
fi

readonly IP="$1"
shift
readonly TXMODE="$1"
shift
readonly EXTRA_ARGS="$@"

# Argument parsing: configure addresses
if [[ "${IP}" == "4" ]]; then
	readonly SADDR="${SADDR4}"
	readonly DADDR="${DADDR4}"
elif [[ "${IP}" == "6" ]]; then
	readonly SADDR="${SADDR6}"
	readonly DADDR="${DADDR6}"
else
	echo "Invalid IP version ${IP}"
	exit 1
fi

# Argument parsing: select receive mode
#
# This differs from send mode for
# - packet: use raw recv, because packet receives skb clones
# - raw_hdrincl: use raw recv, because hdrincl is a tx-only option
case "${TXMODE}" in
'packet' | 'packet_dgram' | 'raw_hdrincl')
	RXMODE='raw'
	;;
*)
	RXMODE="${TXMODE}"
	;;
esac

# Start of state changes: install cleanup handler
save_sysctl_mem="$(sysctl -n ${path_sysctl_mem})"

cleanup() {
	ip netns del "${NS2}"
	ip netns del "${NS1}"
	sysctl -w -q "${path_sysctl_mem}=${save_sysctl_mem}"
}

trap cleanup EXIT

# Configure system settings
sysctl -w -q "${path_sysctl_mem}=1000000"

# Create virtual ethernet pair between network namespaces
ip netns add "${NS1}"
ip netns add "${NS2}"

ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \
  peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}"

# Bring the devices up
ip -netns "${NS1}" link set "${DEV}" up
ip -netns "${NS2}" link set "${DEV}" up

# Set fixed MAC addresses on the devices
ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02
ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06

# Add fixed IP addresses to the devices
ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}"
ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}"
ip -netns "${NS1}" addr add fd::1/64 dev "${DEV}" nodad
ip -netns "${NS2}" addr add fd::2/64 dev "${DEV}" nodad

# Optionally disable sg or csum offload to test edge cases
# ip netns exec "${NS1}" ethtool -K "${DEV}" sg off

do_test() {
	local readonly ARGS="$1"

	echo "ipv${IP} ${TXMODE} ${ARGS}"
	ip netns exec "${NS2}" "${BIN_RX}" "-${IP}" -t 2 -C 2 -S "${SADDR}" -D "${DADDR}" -r "${RXMODE}" &
	sleep 0.2
	ip netns exec "${NS1}" "${BIN_TX}" "-${IP}" -t 1 -D "${DADDR}" ${ARGS} "${TXMODE}"
	wait
}

do_test "${EXTRA_ARGS}"
echo ok