2018-05-02 14:01:23 +03:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */
|
|
|
|
|
|
|
|
#include <linux/init.h>
|
|
|
|
#include <linux/sched/mm.h>
|
|
|
|
#include <linux/sched/signal.h>
|
|
|
|
#include <linux/sched/task.h>
|
|
|
|
#include <linux/uaccess.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/bpf.h>
|
|
|
|
#include <linux/mm.h>
|
2018-07-31 06:43:53 +03:00
|
|
|
#include <linux/netdevice.h>
|
|
|
|
#include <linux/rtnetlink.h>
|
2019-01-24 21:59:38 +03:00
|
|
|
#include <linux/idr.h>
|
2018-05-02 14:01:23 +03:00
|
|
|
|
|
|
|
#include "xdp_umem.h"
|
2018-06-04 15:05:51 +03:00
|
|
|
#include "xsk_queue.h"
|
2018-05-02 14:01:23 +03:00
|
|
|
|
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offsetting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings is done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
|
|
|
/* Smallest chunk size accepted at umem registration; smaller values are
 * rejected with -EINVAL.
 */
#define XDP_UMEM_MIN_CHUNK_SIZE 2048
|
2018-05-02 14:01:23 +03:00
|
|
|
|
2019-01-24 21:59:38 +03:00
|
|
|
/* ID allocator backing umem->id; an id is handed back to it in
 * xdp_umem_release().
 */
static DEFINE_IDA(umem_ida);
|
|
|
|
|
2018-06-04 15:05:57 +03:00
|
|
|
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
spin_lock_irqsave(&umem->xsk_list_lock, flags);
|
|
|
|
list_add_rcu(&xs->list, &umem->xsk_list);
|
|
|
|
spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
|
2018-10-05 14:25:15 +03:00
|
|
|
spin_lock_irqsave(&umem->xsk_list_lock, flags);
|
|
|
|
list_del_rcu(&xs->list);
|
|
|
|
spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
|
2018-06-04 15:05:57 +03:00
|
|
|
}
|
|
|
|
|
2018-10-01 15:51:34 +03:00
|
|
|
/* The umem is stored both in the _rx struct and the _tx struct as we do
|
|
|
|
* not know if the device has more tx queues than rx, or the opposite.
|
|
|
|
* This might also change during run time.
|
|
|
|
*/
|
2019-01-10 22:29:02 +03:00
|
|
|
static int xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
|
|
|
|
u16 queue_id)
|
2018-07-31 06:43:53 +03:00
|
|
|
{
|
2019-01-10 22:29:02 +03:00
|
|
|
if (queue_id >= max_t(unsigned int,
|
|
|
|
dev->real_num_rx_queues,
|
|
|
|
dev->real_num_tx_queues))
|
|
|
|
return -EINVAL;
|
|
|
|
|
2018-10-01 15:51:34 +03:00
|
|
|
if (queue_id < dev->real_num_rx_queues)
|
|
|
|
dev->_rx[queue_id].umem = umem;
|
|
|
|
if (queue_id < dev->real_num_tx_queues)
|
|
|
|
dev->_tx[queue_id].umem = umem;
|
2019-01-10 22:29:02 +03:00
|
|
|
|
|
|
|
return 0;
|
2018-10-01 15:51:34 +03:00
|
|
|
}
|
2018-07-31 06:43:53 +03:00
|
|
|
|
2018-10-01 15:51:36 +03:00
|
|
|
struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
|
|
|
|
u16 queue_id)
|
2018-10-01 15:51:34 +03:00
|
|
|
{
|
|
|
|
if (queue_id < dev->real_num_rx_queues)
|
|
|
|
return dev->_rx[queue_id].umem;
|
|
|
|
if (queue_id < dev->real_num_tx_queues)
|
|
|
|
return dev->_tx[queue_id].umem;
|
2018-07-31 06:43:53 +03:00
|
|
|
|
2018-10-01 15:51:34 +03:00
|
|
|
return NULL;
|
|
|
|
}
|
2018-12-18 16:45:13 +03:00
|
|
|
EXPORT_SYMBOL(xdp_get_umem_from_qid);
|
2018-07-31 06:43:53 +03:00
|
|
|
|
2018-10-01 15:51:34 +03:00
|
|
|
/* Reset the umem pointer stored at @queue_id of @dev to NULL, on the rx
 * side and/or the tx side, for whichever of them @queue_id is in range.
 */
static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
{
	const bool has_rx = queue_id < dev->real_num_rx_queues;
	const bool has_tx = queue_id < dev->real_num_tx_queues;

	if (has_rx)
		dev->_rx[queue_id].umem = NULL;
	if (has_tx)
		dev->_tx[queue_id].umem = NULL;
}
|
|
|
|
|
2018-06-04 15:05:55 +03:00
|
|
|
/**
 * xdp_umem_assign_dev - bind a umem to one queue of a network device
 * @umem: umem to bind
 * @dev: device to bind to
 * @queue_id: queue index on @dev
 * @flags: bind flags; only XDP_ZEROCOPY and XDP_COPY are examined here
 *
 * Registers @umem at @queue_id while holding the rtnl lock. Unless
 * XDP_COPY was requested, the driver is then asked through ndo_bpf()
 * (XDP_SETUP_XSK_UMEM) to enable zero-copy; on success a reference on
 * @dev is taken and umem->zc is set. If zero-copy cannot be enabled and
 * XDP_ZEROCOPY was not requested, the umem stays registered and the call
 * succeeds (copy-mode fallback).
 *
 * Return: 0 on success; -EINVAL if both XDP_ZEROCOPY and XDP_COPY are set
 * or @queue_id is out of range; -EBUSY if the queue already has a umem;
 * -EOPNOTSUPP or the driver's error code if zero-copy was forced but
 * could not be enabled.
 */
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u16 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err = 0;

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	rtnl_lock();
	if (xdp_get_umem_from_qid(dev, queue_id)) {
		err = -EBUSY;
		goto out_rtnl_unlock;
	}

	err = xdp_reg_umem_at_qid(dev, umem, queue_id);
	if (err)
		goto out_rtnl_unlock;

	umem->dev = dev;
	umem->queue_id = queue_id;
	if (force_copy)
		/* For copy-mode, we are done. */
		goto out_rtnl_unlock;

	if (!dev->netdev_ops->ndo_bpf ||
	    !dev->netdev_ops->ndo_xsk_async_xmit) {
		err = -EOPNOTSUPP;
		goto err_unreg_umem;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_unreg_umem;
	rtnl_unlock();

	dev_hold(dev);
	umem->zc = true;
	return 0;

err_unreg_umem:
	if (!force_zc)
		err = 0; /* fallback to copy mode */
	if (err) {
		xdp_clear_umem_at_qid(dev, queue_id);
		/* The binding failed: forget the device as well, so that
		 * xdp_umem_clear_dev() does not later clear a queue slot
		 * this umem no longer owns through a stale pointer.
		 */
		umem->dev = NULL;
	}
out_rtnl_unlock:
	rtnl_unlock();
	return err;
}
|
|
|
|
|
2018-06-04 15:05:57 +03:00
|
|
|
static void xdp_umem_clear_dev(struct xdp_umem *umem)
|
2018-06-04 15:05:55 +03:00
|
|
|
{
|
|
|
|
struct netdev_bpf bpf;
|
|
|
|
int err;
|
|
|
|
|
2018-10-01 15:51:34 +03:00
|
|
|
if (umem->zc) {
|
2018-06-04 15:05:55 +03:00
|
|
|
bpf.command = XDP_SETUP_XSK_UMEM;
|
|
|
|
bpf.xsk.umem = NULL;
|
|
|
|
bpf.xsk.queue_id = umem->queue_id;
|
|
|
|
|
|
|
|
rtnl_lock();
|
|
|
|
err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
|
|
|
|
rtnl_unlock();
|
|
|
|
|
|
|
|
if (err)
|
|
|
|
WARN(1, "failed to disable umem!\n");
|
2018-10-01 15:51:34 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (umem->dev) {
|
|
|
|
rtnl_lock();
|
|
|
|
xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
|
|
|
|
rtnl_unlock();
|
|
|
|
}
|
2018-06-04 15:05:55 +03:00
|
|
|
|
2018-10-01 15:51:34 +03:00
|
|
|
if (umem->zc) {
|
2018-06-04 15:05:55 +03:00
|
|
|
dev_put(umem->dev);
|
2018-10-01 15:51:34 +03:00
|
|
|
umem->zc = false;
|
2018-06-04 15:05:55 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-05-02 14:01:23 +03:00
|
|
|
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
|
|
|
|
{
|
|
|
|
unsigned int i;
|
|
|
|
|
2018-05-22 10:35:02 +03:00
|
|
|
for (i = 0; i < umem->npgs; i++) {
|
|
|
|
struct page *page = umem->pgs[i];
|
2018-05-02 14:01:23 +03:00
|
|
|
|
2018-05-22 10:35:02 +03:00
|
|
|
set_page_dirty_lock(page);
|
|
|
|
put_page(page);
|
2018-05-02 14:01:23 +03:00
|
|
|
}
|
2018-05-22 10:35:02 +03:00
|
|
|
|
|
|
|
kfree(umem->pgs);
|
|
|
|
umem->pgs = NULL;
|
2018-05-02 14:01:23 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
|
|
|
|
{
|
2018-06-08 01:06:01 +03:00
|
|
|
if (umem->user) {
|
|
|
|
atomic_long_sub(umem->npgs, &umem->user->locked_vm);
|
|
|
|
free_uid(umem->user);
|
|
|
|
}
|
2018-05-02 14:01:23 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static void xdp_umem_release(struct xdp_umem *umem)
|
|
|
|
{
|
2018-06-04 15:05:55 +03:00
|
|
|
xdp_umem_clear_dev(umem);
|
|
|
|
|
2019-01-24 21:59:38 +03:00
|
|
|
ida_simple_remove(&umem_ida, umem->id);
|
|
|
|
|
2018-05-02 14:01:24 +03:00
|
|
|
if (umem->fq) {
|
|
|
|
xskq_destroy(umem->fq);
|
|
|
|
umem->fq = NULL;
|
|
|
|
}
|
|
|
|
|
2018-05-02 14:01:31 +03:00
|
|
|
if (umem->cq) {
|
|
|
|
xskq_destroy(umem->cq);
|
|
|
|
umem->cq = NULL;
|
|
|
|
}
|
|
|
|
|
2018-09-07 11:18:46 +03:00
|
|
|
xsk_reuseq_destroy(umem);
|
|
|
|
|
2018-05-22 10:35:02 +03:00
|
|
|
xdp_umem_unpin_pages(umem);
|
2018-05-02 14:01:23 +03:00
|
|
|
|
2018-06-04 15:05:52 +03:00
|
|
|
kfree(umem->pages);
|
|
|
|
umem->pages = NULL;
|
|
|
|
|
2018-05-02 14:01:23 +03:00
|
|
|
xdp_umem_unaccount_pages(umem);
|
|
|
|
kfree(umem);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void xdp_umem_release_deferred(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct xdp_umem *umem = container_of(work, struct xdp_umem, work);
|
|
|
|
|
|
|
|
xdp_umem_release(umem);
|
|
|
|
}
|
|
|
|
|
|
|
|
void xdp_get_umem(struct xdp_umem *umem)
|
|
|
|
{
|
2018-05-22 10:35:03 +03:00
|
|
|
refcount_inc(&umem->users);
|
2018-05-02 14:01:23 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
void xdp_put_umem(struct xdp_umem *umem)
|
|
|
|
{
|
|
|
|
if (!umem)
|
|
|
|
return;
|
|
|
|
|
2018-05-22 10:35:03 +03:00
|
|
|
if (refcount_dec_and_test(&umem->users)) {
|
2018-05-02 14:01:23 +03:00
|
|
|
INIT_WORK(&umem->work, xdp_umem_release_deferred);
|
|
|
|
schedule_work(&umem->work);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int xdp_umem_pin_pages(struct xdp_umem *umem)
|
|
|
|
{
|
|
|
|
unsigned int gup_flags = FOLL_WRITE;
|
|
|
|
long npgs;
|
|
|
|
int err;
|
|
|
|
|
2018-06-11 14:57:12 +03:00
|
|
|
umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
|
|
|
|
GFP_KERNEL | __GFP_NOWARN);
|
2018-05-02 14:01:23 +03:00
|
|
|
if (!umem->pgs)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2019-02-11 19:15:29 +03:00
|
|
|
down_read(¤t->mm->mmap_sem);
|
mm/gup: replace get_user_pages_longterm() with FOLL_LONGTERM
Pach series "Add FOLL_LONGTERM to GUP fast and use it".
HFI1, qib, and mthca, use get_user_pages_fast() due to its performance
advantages. These pages can be held for a significant time. But
get_user_pages_fast() does not protect against mapping FS DAX pages.
Introduce FOLL_LONGTERM and use this flag in get_user_pages_fast() which
retains the performance while also adding the FS DAX checks. XDP has also
shown interest in using this functionality.[1]
In addition we change get_user_pages() to use the new FOLL_LONGTERM flag
and remove the specialized get_user_pages_longterm call.
[1] https://lkml.org/lkml/2019/3/19/939
"longterm" is a relative thing and at this point is probably a misnomer.
This is really flagging a pin which is going to be given to hardware and
can't move. I've thought of a couple of alternative names but I think we
have to settle on if we are going to use FL_LAYOUT or something else to
solve the "longterm" problem. Then I think we can change the flag to a
better name.
Secondly, it depends on how often you are registering memory. I have
spoken with some RDMA users who consider MR in the performance path...
For the overall application performance. I don't have the numbers as the
tests for HFI1 were done a long time ago. But there was a significant
advantage. Some of which is probably due to the fact that you don't have
to hold mmap_sem.
Finally, architecturally I think it would be good for everyone to use
*_fast. There are patches submitted to the RDMA list which would allow
the use of *_fast (they reworking the use of mmap_sem) and as soon as they
are accepted I'll submit a patch to convert the RDMA core as well. Also
to this point others are looking to use *_fast.
As an aside, Jasons pointed out in my previous submission that *_fast and
*_unlocked look very much the same. I agree and I think further cleanup
will be coming. But I'm focused on getting the final solution for DAX at
the moment.
This patch (of 7):
This patch starts a series which aims to support FOLL_LONGTERM in
get_user_pages_fast(). Some callers who would like to do a longterm (user
controlled pin) of pages with the fast variant of GUP for performance
purposes.
Rather than have a separate get_user_pages_longterm() call, introduce
FOLL_LONGTERM and change the longterm callers to use it.
This patch does not change any functionality. In the short term
"longterm" or user controlled pins are unsafe for Filesystems and FS DAX
in particular has been blocked. However, callers of get_user_pages_fast()
were not "protected".
FOLL_LONGTERM can _only_ be supported with get_user_pages[_fast]() as it
requires vmas to determine if DAX is in use.
NOTE: In merging with the CMA changes we opt to change the
get_user_pages() call in check_and_migrate_cma_pages() to a call of
__get_user_pages_locked() on the newly migrated pages. This makes the
code read better in that we are calling __get_user_pages_locked() on the
pages before and after a potential migration.
As a side affect some of the interfaces are cleaned up but this is not the
primary purpose of the series.
In review[1] it was asked:
<quote>
> This I don't get - if you do lock down long term mappings performance
> of the actual get_user_pages call shouldn't matter to start with.
>
> What do I miss?
A couple of points.
First "longterm" is a relative thing and at this point is probably a
misnomer. This is really flagging a pin which is going to be given to
hardware and can't move. I've thought of a couple of alternative names
but I think we have to settle on if we are going to use FL_LAYOUT or
something else to solve the "longterm" problem. Then I think we can
change the flag to a better name.
Second, It depends on how often you are registering memory. I have spoken
with some RDMA users who consider MR in the performance path... For the
overall application performance. I don't have the numbers as the tests
for HFI1 were done a long time ago. But there was a significant
advantage. Some of which is probably due to the fact that you don't have
to hold mmap_sem.
Finally, architecturally I think it would be good for everyone to use
*_fast. There are patches submitted to the RDMA list which would allow
the use of *_fast (they reworking the use of mmap_sem) and as soon as they
are accepted I'll submit a patch to convert the RDMA core as well. Also
to this point others are looking to use *_fast.
As an asside, Jasons pointed out in my previous submission that *_fast and
*_unlocked look very much the same. I agree and I think further cleanup
will be coming. But I'm focused on getting the final solution for DAX at
the moment.
</quote>
[1] https://lore.kernel.org/lkml/20190220180255.GA12020@iweiny-DESK2.sc.intel.com/T/#md6abad2569f3bf6c1f03686c8097ab6563e94965
[ira.weiny@intel.com: v3]
Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com
Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com
Link: http://lkml.kernel.org/r/20190317183438.2057-2-ira.weiny@intel.com
Signed-off-by: Ira Weiny <ira.weiny@intel.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Rich Felker <dalias@libc.org>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: James Hogan <jhogan@kernel.org>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Mike Marshall <hubcap@omnibond.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2019-05-14 03:17:03 +03:00
|
|
|
npgs = get_user_pages(umem->address, umem->npgs,
|
|
|
|
gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
|
2019-02-11 19:15:29 +03:00
|
|
|
up_read(¤t->mm->mmap_sem);
|
2018-05-02 14:01:23 +03:00
|
|
|
|
|
|
|
if (npgs != umem->npgs) {
|
|
|
|
if (npgs >= 0) {
|
|
|
|
umem->npgs = npgs;
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out_pin;
|
|
|
|
}
|
|
|
|
err = npgs;
|
|
|
|
goto out_pgs;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_pin:
|
|
|
|
xdp_umem_unpin_pages(umem);
|
|
|
|
out_pgs:
|
|
|
|
kfree(umem->pgs);
|
|
|
|
umem->pgs = NULL;
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int xdp_umem_account_pages(struct xdp_umem *umem)
|
|
|
|
{
|
|
|
|
unsigned long lock_limit, new_npgs, old_npgs;
|
|
|
|
|
|
|
|
if (capable(CAP_IPC_LOCK))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
|
|
|
|
umem->user = get_uid(current_user());
|
|
|
|
|
|
|
|
do {
|
|
|
|
old_npgs = atomic_long_read(&umem->user->locked_vm);
|
|
|
|
new_npgs = old_npgs + umem->npgs;
|
|
|
|
if (new_npgs > lock_limit) {
|
|
|
|
free_uid(umem->user);
|
|
|
|
umem->user = NULL;
|
|
|
|
return -ENOBUFS;
|
|
|
|
}
|
|
|
|
} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
|
|
|
|
new_npgs) != old_npgs);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-05-22 10:35:02 +03:00
|
|
|
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
|
2018-05-02 14:01:23 +03:00
|
|
|
{
|
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offsetting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings is done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
|
|
|
u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
|
|
|
|
unsigned int chunks, chunks_per_page;
|
2018-05-02 14:01:23 +03:00
|
|
|
u64 addr = mr->addr, size = mr->len;
|
2018-06-04 15:05:52 +03:00
|
|
|
int size_chk, err, i;
|
2018-05-02 14:01:23 +03:00
|
|
|
|
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offsetting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings is done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
|
|
|
if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
|
2018-05-02 14:01:23 +03:00
|
|
|
/* Strictly speaking we could support this, if:
|
|
|
|
* - huge pages, or
|
|
|
|
* - using an IOMMU, or
|
|
|
|
* - making sure the memory area is consecutive
|
|
|
|
* but for now, we simply say "computer says no".
|
|
|
|
*/
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offsetting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings is done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
|
|
|
if (!is_power_of_2(chunk_size))
|
2018-05-02 14:01:23 +03:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (!PAGE_ALIGNED(addr)) {
|
|
|
|
/* Memory area has to be page size aligned. For
|
|
|
|
* simplicity, this might change.
|
|
|
|
*/
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((addr + size) < addr)
|
|
|
|
return -EINVAL;
|
|
|
|
|
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offsetting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings is done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
|
|
|
chunks = (unsigned int)div_u64(size, chunk_size);
|
|
|
|
if (chunks == 0)
|
2018-05-02 14:01:23 +03:00
|
|
|
return -EINVAL;
|
|
|
|
|
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offsetting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings is done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
|
|
|
chunks_per_page = PAGE_SIZE / chunk_size;
|
|
|
|
if (chunks < chunks_per_page || chunks % chunks_per_page)
|
2018-05-02 14:01:23 +03:00
|
|
|
return -EINVAL;
|
|
|
|
|
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offsetting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings is done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
|
|
|
headroom = ALIGN(headroom, 64);
|
2018-05-02 14:01:23 +03:00
|
|
|
|
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offsetting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings is done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
|
|
|
size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
|
2018-05-02 14:01:23 +03:00
|
|
|
if (size_chk < 0)
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
umem->address = (unsigned long)addr;
|
2018-08-31 14:40:02 +03:00
|
|
|
umem->chunk_mask = ~((u64)chunk_size - 1);
|
|
|
|
umem->size = size;
|
xsk: new descriptor addressing scheme
Currently, AF_XDP only supports a fixed frame-size memory scheme where
each frame is referenced via an index (idx). A user passes the frame
index to the kernel, and the kernel acts upon the data. Some NICs,
however, do not have a fixed frame-size model, instead they have a
model where a memory window is passed to the hardware and multiple
frames are filled into that window (referred to as the "type-writer"
model).
By changing the descriptor format from the current frame index
addressing scheme, AF_XDP can in the future be extended to support
these kinds of NICs.
In the index-based model, an idx refers to a frame of size
frame_size. Addressing a frame in the UMEM is done by offsetting the
UMEM starting address by a global offset, idx * frame_size + offset.
Communicating via the fill- and completion-rings is done by means of
idx.
In this commit, the idx is removed in favor of an address (addr),
which is a relative address ranging over the UMEM. To convert an
idx-based address to the new addr is simply: addr = idx * frame_size +
offset.
We also stop referring to the UMEM "frame" as a frame. Instead it is
simply called a chunk.
To transfer ownership of a chunk to the kernel, the addr of the chunk
is passed in the fill-ring. Note, that the kernel will mask addr to
make it chunk aligned, so there is no need for userspace to do
that. E.g., for a chunk size of 2k, passing an addr of 2048, 2050 or
3000 to the fill-ring will refer to the same chunk.
On the completion-ring, the addr will match that of the Tx descriptor,
passed to the kernel.
Changing the descriptor format to use chunks/addr will allow for
future changes to move to a type-writer based model, where multiple
frames can reside in one chunk. In this model passing one single chunk
into the fill-ring, would potentially result in multiple Rx
descriptors.
This commit changes the uapi of AF_XDP sockets, and updates the
documentation.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-06-04 14:57:13 +03:00
|
|
|
umem->headroom = headroom;
|
|
|
|
umem->chunk_size_nohr = chunk_size - headroom;
|
2018-05-02 14:01:23 +03:00
|
|
|
umem->npgs = size / PAGE_SIZE;
|
|
|
|
umem->pgs = NULL;
|
|
|
|
umem->user = NULL;
|
2018-06-04 15:05:57 +03:00
|
|
|
INIT_LIST_HEAD(&umem->xsk_list);
|
|
|
|
spin_lock_init(&umem->xsk_list_lock);
|
2018-05-02 14:01:23 +03:00
|
|
|
|
2018-05-22 10:35:03 +03:00
|
|
|
refcount_set(&umem->users, 1);
|
2018-05-02 14:01:23 +03:00
|
|
|
|
|
|
|
err = xdp_umem_account_pages(umem);
|
|
|
|
if (err)
|
2019-03-13 17:15:49 +03:00
|
|
|
return err;
|
2018-05-02 14:01:23 +03:00
|
|
|
|
|
|
|
err = xdp_umem_pin_pages(umem);
|
|
|
|
if (err)
|
|
|
|
goto out_account;
|
2018-06-04 15:05:52 +03:00
|
|
|
|
|
|
|
umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
|
|
|
|
if (!umem->pages) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto out_account;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; i < umem->npgs; i++)
|
|
|
|
umem->pages[i].addr = page_address(umem->pgs[i]);
|
|
|
|
|
2018-05-02 14:01:23 +03:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
out_account:
|
|
|
|
xdp_umem_unaccount_pages(umem);
|
|
|
|
return err;
|
|
|
|
}
|
2018-05-02 14:01:26 +03:00
|
|
|
|
2018-05-22 10:35:02 +03:00
|
|
|
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
|
|
|
|
{
|
|
|
|
struct xdp_umem *umem;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
umem = kzalloc(sizeof(*umem), GFP_KERNEL);
|
|
|
|
if (!umem)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
2019-01-24 21:59:38 +03:00
|
|
|
err = ida_simple_get(&umem_ida, 0, 0, GFP_KERNEL);
|
|
|
|
if (err < 0) {
|
|
|
|
kfree(umem);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
|
|
|
umem->id = err;
|
|
|
|
|
2018-05-22 10:35:02 +03:00
|
|
|
err = xdp_umem_reg(umem, mr);
|
|
|
|
if (err) {
|
2019-01-24 21:59:38 +03:00
|
|
|
ida_simple_remove(&umem_ida, umem->id);
|
2018-05-22 10:35:02 +03:00
|
|
|
kfree(umem);
|
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
|
|
|
|
|
|
|
return umem;
|
|
|
|
}
|
|
|
|
|
2018-05-02 14:01:26 +03:00
|
|
|
bool xdp_umem_validate_queues(struct xdp_umem *umem)
|
|
|
|
{
|
2018-05-18 15:00:23 +03:00
|
|
|
return umem->fq && umem->cq;
|
2018-05-02 14:01:26 +03:00
|
|
|
}
|