2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
|
|
* operating system. INET is implemented using the BSD Socket
|
|
|
|
* interface as the means of communication with the user level.
|
|
|
|
*
|
|
|
|
* The User Datagram Protocol (UDP).
|
|
|
|
*
|
2005-05-06 03:16:16 +04:00
|
|
|
* Authors: Ross Biro
|
2005-04-17 02:20:36 +04:00
|
|
|
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
|
|
|
|
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
|
2008-10-14 06:01:08 +04:00
|
|
|
* Alan Cox, <alan@lxorguk.ukuu.org.uk>
|
2005-04-17 02:20:36 +04:00
|
|
|
* Hirokazu Takahashi, <taka@valinux.co.jp>
|
|
|
|
*
|
|
|
|
* Fixes:
|
|
|
|
* Alan Cox : verify_area() calls
|
|
|
|
* Alan Cox : stopped close while in use off icmp
|
|
|
|
* messages. Not a fix but a botch that
|
|
|
|
* for udp at least is 'valid'.
|
|
|
|
* Alan Cox : Fixed icmp handling properly
|
|
|
|
* Alan Cox : Correct error for oversized datagrams
|
2007-02-09 17:24:47 +03:00
|
|
|
* Alan Cox : Tidied select() semantics.
|
|
|
|
* Alan Cox : udp_err() fixed properly, also now
|
2005-04-17 02:20:36 +04:00
|
|
|
* select and read wake correctly on errors
|
|
|
|
* Alan Cox : udp_send verify_area moved to avoid mem leak
|
|
|
|
* Alan Cox : UDP can count its memory
|
|
|
|
* Alan Cox : send to an unknown connection causes
|
|
|
|
* an ECONNREFUSED off the icmp, but
|
|
|
|
* does NOT close.
|
|
|
|
* Alan Cox : Switched to new sk_buff handlers. No more backlog!
|
|
|
|
* Alan Cox : Using generic datagram code. Even smaller and the PEEK
|
|
|
|
* bug no longer crashes it.
|
|
|
|
* Fred Van Kempen : Net2e support for sk->broadcast.
|
|
|
|
* Alan Cox : Uses skb_free_datagram
|
|
|
|
* Alan Cox : Added get/set sockopt support.
|
|
|
|
* Alan Cox : Broadcasting without option set returns EACCES.
|
|
|
|
* Alan Cox : No wakeup calls. Instead we now use the callbacks.
|
|
|
|
* Alan Cox : Use ip_tos and ip_ttl
|
|
|
|
* Alan Cox : SNMP Mibs
|
|
|
|
* Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support.
|
|
|
|
* Matt Dillon : UDP length checks.
|
|
|
|
* Alan Cox : Smarter af_inet used properly.
|
|
|
|
* Alan Cox : Use new kernel side addressing.
|
|
|
|
* Alan Cox : Incorrect return on truncated datagram receive.
|
|
|
|
* Arnt Gulbrandsen : New udp_send and stuff
|
|
|
|
* Alan Cox : Cache last socket
|
|
|
|
* Alan Cox : Route cache
|
|
|
|
* Jon Peatfield : Minor efficiency fix to sendto().
|
|
|
|
* Mike Shaver : RFC1122 checks.
|
|
|
|
* Alan Cox : Nonblocking error fix.
|
|
|
|
* Willy Konynenberg : Transparent proxying support.
|
|
|
|
* Mike McLagan : Routing by source
|
|
|
|
* David S. Miller : New socket lookup architecture.
|
|
|
|
* Last socket cache retained as it
|
|
|
|
* does have a high hit rate.
|
|
|
|
* Olaf Kirch : Don't linearise iovec on sendmsg.
|
|
|
|
* Andi Kleen : Some cleanups, cache destination entry
|
2007-02-09 17:24:47 +03:00
|
|
|
* for connect.
|
2005-04-17 02:20:36 +04:00
|
|
|
* Vitaly E. Lavrov : Transparent proxy revived after year coma.
|
|
|
|
* Melvin Smith : Check msg_name not msg_namelen in sendto(),
|
|
|
|
* return ENOTCONN for unconnected sockets (POSIX)
|
|
|
|
* Janos Farkas : don't deliver multi/broadcasts to a different
|
|
|
|
* bound-to-device socket
|
|
|
|
* Hirokazu Takahashi : HW checksumming for outgoing UDP
|
|
|
|
* datagrams.
|
|
|
|
* Hirokazu Takahashi : sendfile() on UDP works now.
|
|
|
|
* Arnaldo C. Melo : convert /proc/net/udp to seq_file
|
|
|
|
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
|
|
|
|
* Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
|
|
|
|
* a single port at the same time.
|
|
|
|
* Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
|
2007-06-28 02:37:46 +04:00
|
|
|
* James Chapman : Add L2TP encapsulation type.
|
2005-04-17 02:20:36 +04:00
|
|
|
*
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
2007-02-09 17:24:47 +03:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <asm/system.h>
|
|
|
|
#include <asm/uaccess.h>
|
|
|
|
#include <asm/ioctls.h>
|
2007-12-31 11:29:24 +03:00
|
|
|
#include <linux/bootmem.h>
|
2008-10-29 12:32:32 +03:00
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/swap.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/fcntl.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/socket.h>
|
|
|
|
#include <linux/sockios.h>
|
2005-12-27 07:43:12 +03:00
|
|
|
#include <linux/igmp.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <linux/in.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/timer.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/inet.h>
|
|
|
|
#include <linux/netdevice.h>
|
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h
percpu.h is included by sched.h and module.h and thus ends up being
included when building most .c files. percpu.h includes slab.h which
in turn includes gfp.h making everything defined by the two files
universally available and complicating inclusion dependencies.
percpu.h -> slab.h dependency is about to be removed. Prepare for
this change by updating users of gfp and slab facilities include those
headers directly instead of assuming availability. As this conversion
needs to touch large number of source files, the following script is
used as the basis of conversion.
http://userweb.kernel.org/~tj/misc/slabh-sweep.py
The script does the following.
* Scan files for gfp and slab usages and update includes such that
only the necessary includes are there. ie. if only gfp is used,
gfp.h, if slab is used, slab.h.
* When the script inserts a new include, it looks at the include
blocks and try to put the new include such that its order conforms
to its surrounding. It's put in the include block which contains
core kernel includes, in the same order that the rest are ordered -
alphabetical, Christmas tree, rev-Xmas-tree or at the end if there
doesn't seem to be any matching order.
* If the script can't find a place to put a new include (mostly
because the file doesn't have fitting include block), it prints out
an error message indicating which .h file needs to be added to the
file.
The conversion was done in the following steps.
1. The initial automatic conversion of all .c files updated slightly
over 4000 files, deleting around 700 includes and adding ~480 gfp.h
and ~3000 slab.h inclusions. The script emitted errors for ~400
files.
2. Each error was manually checked. Some didn't need the inclusion,
some needed manual addition while adding it to implementation .h or
embedding .c file was more appropriate for others. This step added
inclusions to around 150 files.
3. The script was run again and the output was compared to the edits
from #2 to make sure no file was left behind.
4. Several build tests were done and a couple of problems were fixed.
e.g. lib/decompress_*.c used malloc/free() wrappers around slab
APIs requiring slab.h to be added manually.
5. The script was run on all .h files but without automatically
editing them as sprinkling gfp.h and slab.h inclusions around .h
files could easily lead to inclusion dependency hell. Most gfp.h
inclusion directives were ignored as stuff from gfp.h was usually
wildly available and often used in preprocessor macros. Each
slab.h inclusion directive was examined and added manually as
necessary.
6. percpu.h was updated not to include slab.h.
7. Build test were done on the following configurations and failures
were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my
distributed build env didn't work with gcov compiles) and a few
more options had to be turned off depending on archs to make things
build (like ipr on powerpc/64 which failed due to missing writeq).
* x86 and x86_64 UP and SMP allmodconfig and a custom test config.
* powerpc and powerpc64 SMP allmodconfig
* sparc and sparc64 SMP allmodconfig
* ia64 SMP allmodconfig
* s390 SMP allmodconfig
* alpha SMP allmodconfig
* um on x86_64 SMP allmodconfig
8. percpu.h modifications were reverted so that it could be applied as
a separate patch and serve as bisection point.
Given the fact that I had only a couple of failures from tests on step
6, I'm fairly confident about the coverage of this conversion patch.
If there is a breakage, it's likely to be something in one of the arch
headers which should be easily discoverable on most builds of
the specific arch.
Signed-off-by: Tejun Heo <tj@kernel.org>
Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 11:04:11 +03:00
|
|
|
#include <linux/slab.h>
|
2005-08-10 07:08:28 +04:00
|
|
|
#include <net/tcp_states.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <linux/skbuff.h>
|
|
|
|
#include <linux/proc_fs.h>
|
|
|
|
#include <linux/seq_file.h>
|
2007-09-12 14:01:34 +04:00
|
|
|
#include <net/net_namespace.h>
|
2005-04-17 02:20:36 +04:00
|
|
|
#include <net/icmp.h>
|
|
|
|
#include <net/route.h>
|
|
|
|
#include <net/checksum.h>
|
|
|
|
#include <net/xfrm.h>
|
2006-11-27 22:10:57 +03:00
|
|
|
#include "udp_impl.h"
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2009-10-07 04:37:59 +04:00
|
|
|
/*
 * Global udp_table instance, exported with EXPORT_SYMBOL.
 * NOTE(review): presumably the table plain UDP sockets hash into; the
 * lookup code in this file reaches its table through
 * sk->sk_prot->h.udp_table rather than naming this variable - confirm.
 */
struct udp_table udp_table __read_mostly;
|
2008-10-29 11:41:45 +03:00
|
|
|
EXPORT_SYMBOL(udp_table);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2007-12-31 11:29:24 +03:00
|
|
|
/*
 * Three exported integers.
 * NOTE(review): by name these look like the sysctl-controlled UDP memory
 * thresholds; their meaning and units are not visible here - confirm
 * against the sysctl table.
 */
int sysctl_udp_mem[3] __read_mostly;
|
|
|
|
EXPORT_SYMBOL(sysctl_udp_mem);
|
2009-07-17 04:26:32 +04:00
|
|
|
|
|
|
|
/* NOTE(review): presumably the minimum receive-buffer sysctl - confirm. */
int sysctl_udp_rmem_min __read_mostly;
|
2007-12-31 11:29:24 +03:00
|
|
|
EXPORT_SYMBOL(sysctl_udp_rmem_min);
|
2009-07-17 04:26:32 +04:00
|
|
|
|
|
|
|
/* NOTE(review): presumably the minimum send-buffer sysctl - confirm. */
int sysctl_udp_wmem_min __read_mostly;
|
2007-12-31 11:29:24 +03:00
|
|
|
EXPORT_SYMBOL(sysctl_udp_wmem_min);
|
|
|
|
|
|
|
|
/*
 * Exported atomic counter.
 * NOTE(review): by name, the amount of memory currently charged to UDP;
 * the accounting code is not in this part of the file - confirm.
 */
atomic_t udp_memory_allocated;
|
|
|
|
EXPORT_SYMBOL(udp_memory_allocated);
|
|
|
|
|
2009-10-07 04:37:59 +04:00
|
|
|
/* Size of the 16-bit UDP port space. */
#define MAX_UDP_PORTS 65536
|
|
|
|
/*
 * Number of bits in the per-chain bitmap that udp_lib_get_port() declares
 * on its stack when it searches for a free port.
 */
#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
|
2009-01-27 08:35:35 +03:00
|
|
|
|
2008-10-10 01:51:27 +04:00
|
|
|
/*
 * Walk one primary hash chain looking for sockets that conflict with @sk.
 *
 * A socket on the chain conflicts when it is in namespace @net, is not @sk
 * itself, the two sockets do not both have sk_reuse set, their bound
 * devices are compatible (either one unbound, or both equal) and
 * @saddr_comp reports that their local addresses clash.
 *
 * Two modes, selected by @bitmap:
 *  - @bitmap == NULL: only sockets whose udp_port_hash equals @num are
 *    considered; returns 1 on the first conflict, 0 if there is none.
 *  - @bitmap != NULL: @num is ignored; for every conflicting socket the bit
 *    (udp_port_hash >> @log) is set in @bitmap, and 0 is always returned.
 */
static int udp_lib_lport_inuse(struct net *net, __u16 num,
			       const struct udp_hslot *hslot,
			       unsigned long *bitmap,
			       struct sock *sk,
			       int (*saddr_comp)(const struct sock *sk1,
						 const struct sock *sk2),
			       unsigned int log)
{
	struct hlist_nulls_node *pos;
	struct sock *other;

	sk_nulls_for_each(other, pos, &hslot->head) {
		if (!net_eq(sock_net(other), net) || other == sk)
			continue;
		/* Without a bitmap we only care about the exact port. */
		if (!bitmap && udp_sk(other)->udp_port_hash != num)
			continue;
		/* Both sides allowing reuse means no conflict. */
		if (other->sk_reuse && sk->sk_reuse)
			continue;
		/* Sockets bound to two different devices cannot clash. */
		if (other->sk_bound_dev_if && sk->sk_bound_dev_if &&
		    other->sk_bound_dev_if != sk->sk_bound_dev_if)
			continue;
		if (!(*saddr_comp)(sk, other))
			continue;

		if (!bitmap)
			return 1;
		__set_bit(udp_sk(other)->udp_port_hash >> log, bitmap);
	}
	return 0;
}
|
|
|
|
|
2009-11-09 08:26:33 +03:00
|
|
|
/*
|
|
|
|
* Note: we still hold spinlock of primary hash chain, so no other writer
|
|
|
|
* can insert/delete a socket with local_port == num
|
|
|
|
*/
|
|
|
|
static int udp_lib_lport_inuse2(struct net *net, __u16 num,
|
|
|
|
struct udp_hslot *hslot2,
|
|
|
|
struct sock *sk,
|
|
|
|
int (*saddr_comp)(const struct sock *sk1,
|
|
|
|
const struct sock *sk2))
|
|
|
|
{
|
|
|
|
struct sock *sk2;
|
|
|
|
struct hlist_nulls_node *node;
|
|
|
|
int res = 0;
|
|
|
|
|
|
|
|
spin_lock(&hslot2->lock);
|
|
|
|
udp_portaddr_for_each_entry(sk2, node, &hslot2->head)
|
2009-11-23 21:41:23 +03:00
|
|
|
if (net_eq(sock_net(sk2), net) &&
|
|
|
|
sk2 != sk &&
|
|
|
|
(udp_sk(sk2)->udp_port_hash == num) &&
|
|
|
|
(!sk2->sk_reuse || !sk->sk_reuse) &&
|
|
|
|
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
|
|
|
|
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
|
2009-11-09 08:26:33 +03:00
|
|
|
(*saddr_comp)(sk, sk2)) {
|
|
|
|
res = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
spin_unlock(&hslot2->lock);
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2006-08-27 07:06:05 +04:00
|
|
|
/**
|
2008-03-23 02:51:21 +03:00
|
|
|
* udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
|
2006-08-27 07:06:05 +04:00
|
|
|
*
|
|
|
|
* @sk: socket struct in question
|
|
|
|
* @snum: port number to look up
|
2007-06-06 02:18:43 +04:00
|
|
|
* @saddr_comp: AF-dependent comparison of bound local IP addresses
|
2009-11-09 08:26:33 +03:00
|
|
|
* @hash2_nulladdr: AF-dependant hash value in secondary hash chains,
|
|
|
|
* with NULL address
|
2006-08-27 07:06:05 +04:00
|
|
|
*/
|
2008-03-23 02:51:21 +03:00
|
|
|
int udp_lib_get_port(struct sock *sk, unsigned short snum,
|
2007-06-06 02:18:43 +04:00
|
|
|
int (*saddr_comp)(const struct sock *sk1,
|
2009-11-09 08:26:33 +03:00
|
|
|
const struct sock *sk2),
|
|
|
|
unsigned int hash2_nulladdr)
|
2006-08-27 07:06:05 +04:00
|
|
|
{
|
2009-11-08 13:17:58 +03:00
|
|
|
struct udp_hslot *hslot, *hslot2;
|
2008-10-29 11:41:45 +03:00
|
|
|
struct udp_table *udptable = sk->sk_prot->h.udp_table;
|
2006-08-27 07:06:05 +04:00
|
|
|
int error = 1;
|
2008-03-25 20:26:21 +03:00
|
|
|
struct net *net = sock_net(sk);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2007-08-25 10:09:41 +04:00
|
|
|
if (!snum) {
|
2008-10-08 22:44:17 +04:00
|
|
|
int low, high, remaining;
|
|
|
|
unsigned rand;
|
2009-01-27 08:35:35 +03:00
|
|
|
unsigned short first, last;
|
|
|
|
DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
|
2007-08-25 10:09:41 +04:00
|
|
|
|
2007-10-11 04:30:46 +04:00
|
|
|
inet_get_local_port_range(&low, &high);
|
2007-10-19 09:00:17 +04:00
|
|
|
remaining = (high - low) + 1;
|
2007-10-11 04:30:46 +04:00
|
|
|
|
2008-10-08 22:44:17 +04:00
|
|
|
rand = net_random();
|
2009-01-27 08:35:35 +03:00
|
|
|
first = (((u64)rand * remaining) >> 32) + low;
|
|
|
|
/*
|
|
|
|
* force rand to be an odd multiple of UDP_HTABLE_SIZE
|
|
|
|
*/
|
2009-10-07 04:37:59 +04:00
|
|
|
rand = (rand | 1) * (udptable->mask + 1);
|
2009-12-14 06:32:39 +03:00
|
|
|
last = first + udptable->mask + 1;
|
|
|
|
do {
|
2009-10-07 04:37:59 +04:00
|
|
|
hslot = udp_hashslot(udptable, net, first);
|
2009-01-27 08:35:35 +03:00
|
|
|
bitmap_zero(bitmap, PORTS_PER_CHAIN);
|
2008-10-29 11:41:45 +03:00
|
|
|
spin_lock_bh(&hslot->lock);
|
2009-01-27 08:35:35 +03:00
|
|
|
udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
|
2009-10-07 04:37:59 +04:00
|
|
|
saddr_comp, udptable->log);
|
2009-01-27 08:35:35 +03:00
|
|
|
|
|
|
|
snum = first;
|
|
|
|
/*
|
|
|
|
* Iterate on all possible values of snum for this hash.
|
|
|
|
* Using steps of an odd multiple of UDP_HTABLE_SIZE
|
|
|
|
* give us randomization and full range coverage.
|
|
|
|
*/
|
2008-10-08 22:44:17 +04:00
|
|
|
do {
|
2009-01-27 08:35:35 +03:00
|
|
|
if (low <= snum && snum <= high &&
|
2010-05-05 04:27:06 +04:00
|
|
|
!test_bit(snum >> udptable->log, bitmap) &&
|
|
|
|
!inet_is_reserved_local_port(snum))
|
2009-01-27 08:35:35 +03:00
|
|
|
goto found;
|
|
|
|
snum += rand;
|
|
|
|
} while (snum != first);
|
|
|
|
spin_unlock_bh(&hslot->lock);
|
2009-12-14 06:32:39 +03:00
|
|
|
} while (++first != last);
|
2009-01-27 08:35:35 +03:00
|
|
|
goto fail;
|
2008-10-29 11:41:45 +03:00
|
|
|
} else {
|
2009-10-07 04:37:59 +04:00
|
|
|
hslot = udp_hashslot(udptable, net, snum);
|
2008-10-29 11:41:45 +03:00
|
|
|
spin_lock_bh(&hslot->lock);
|
2009-11-09 08:26:33 +03:00
|
|
|
if (hslot->count > 10) {
|
|
|
|
int exist;
|
|
|
|
unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
|
|
|
|
|
|
|
|
slot2 &= udptable->mask;
|
|
|
|
hash2_nulladdr &= udptable->mask;
|
|
|
|
|
|
|
|
hslot2 = udp_hashslot2(udptable, slot2);
|
|
|
|
if (hslot->count < hslot2->count)
|
|
|
|
goto scan_primary_hash;
|
|
|
|
|
|
|
|
exist = udp_lib_lport_inuse2(net, snum, hslot2,
|
|
|
|
sk, saddr_comp);
|
|
|
|
if (!exist && (hash2_nulladdr != slot2)) {
|
|
|
|
hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
|
|
|
|
exist = udp_lib_lport_inuse2(net, snum, hslot2,
|
|
|
|
sk, saddr_comp);
|
|
|
|
}
|
|
|
|
if (exist)
|
|
|
|
goto fail_unlock;
|
|
|
|
else
|
|
|
|
goto found;
|
|
|
|
}
|
|
|
|
scan_primary_hash:
|
2009-10-07 04:37:59 +04:00
|
|
|
if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
|
|
|
|
saddr_comp, 0))
|
2008-10-29 11:41:45 +03:00
|
|
|
goto fail_unlock;
|
|
|
|
}
|
2009-01-27 08:35:35 +03:00
|
|
|
found:
|
2009-10-15 10:30:45 +04:00
|
|
|
inet_sk(sk)->inet_num = snum;
|
2009-11-08 13:17:30 +03:00
|
|
|
udp_sk(sk)->udp_port_hash = snum;
|
|
|
|
udp_sk(sk)->udp_portaddr_hash ^= snum;
|
2005-04-17 02:20:36 +04:00
|
|
|
if (sk_unhashed(sk)) {
|
2008-11-17 06:39:21 +03:00
|
|
|
sk_nulls_add_node_rcu(sk, &hslot->head);
|
2009-11-08 13:17:05 +03:00
|
|
|
hslot->count++;
|
2008-04-01 06:41:46 +04:00
|
|
|
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
|
2009-11-08 13:17:58 +03:00
|
|
|
|
|
|
|
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
|
|
|
|
spin_lock(&hslot2->lock);
|
|
|
|
hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
|
|
|
|
&hslot2->head);
|
|
|
|
hslot2->count++;
|
|
|
|
spin_unlock(&hslot2->lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2006-08-27 07:06:05 +04:00
|
|
|
error = 0;
|
2008-10-29 11:41:45 +03:00
|
|
|
fail_unlock:
|
|
|
|
spin_unlock_bh(&hslot->lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
fail:
|
2006-08-27 07:06:05 +04:00
|
|
|
return error;
|
|
|
|
}
|
2009-07-17 04:26:32 +04:00
|
|
|
EXPORT_SYMBOL(udp_lib_get_port);
|
2006-08-27 07:06:05 +04:00
|
|
|
|
2009-04-09 21:37:33 +04:00
|
|
|
static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
|
2008-03-07 03:22:02 +03:00
|
|
|
{
|
|
|
|
struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
|
|
|
|
|
2009-07-17 04:26:32 +04:00
|
|
|
return (!ipv6_only_sock(sk2) &&
|
2009-10-15 10:30:45 +04:00
|
|
|
(!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
|
|
|
|
inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
|
2008-03-07 03:22:02 +03:00
|
|
|
}
|
|
|
|
|
2009-11-08 13:17:30 +03:00
|
|
|
/*
 * Secondary-hash value for an IPv4 (address, port) pair: the address is
 * hashed with jhash_1word() seeded by the namespace mix, and the port is
 * then XORed in. Passing port 0 therefore yields a partial hash that a
 * port can be XORed into later.
 */
static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
				       unsigned int port)
{
	u32 addr_hash = jhash_1word((__force u32)saddr, net_hash_mix(net));

	return addr_hash ^ port;
}
|
|
|
|
|
2008-03-23 02:51:21 +03:00
|
|
|
int udp_v4_get_port(struct sock *sk, unsigned short snum)
|
2008-03-07 03:22:02 +03:00
|
|
|
{
|
2009-11-09 08:26:33 +03:00
|
|
|
unsigned int hash2_nulladdr =
|
2010-04-21 06:06:52 +04:00
|
|
|
udp4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum);
|
2009-11-09 08:26:33 +03:00
|
|
|
unsigned int hash2_partial =
|
|
|
|
udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
|
|
|
|
|
2009-11-08 13:17:30 +03:00
|
|
|
/* precompute partial secondary hash */
|
2009-11-09 08:26:33 +03:00
|
|
|
udp_sk(sk)->udp_portaddr_hash = hash2_partial;
|
|
|
|
return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
|
2008-03-07 03:22:02 +03:00
|
|
|
}
|
|
|
|
|
2008-10-29 11:41:45 +03:00
|
|
|
/*
 * Rate how well socket @sk matches an incoming packet.
 *
 * Returns -1 when @sk cannot receive the packet: wrong namespace, hashed
 * port differs from @hnum, the socket is IPv6-only, or any field the
 * socket has set disagrees with the packet.
 *
 * Otherwise the score starts at 1 for a PF_INET socket (0 for any other
 * family) and gains 2 for each field that is set on the socket and
 * matches: inet_rcv_saddr vs @daddr, inet_daddr vs @saddr, inet_dport vs
 * @sport, and sk_bound_dev_if vs @dif. @dport is not consulted here.
 */
static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
				unsigned short hnum,
				__be16 sport, __be32 daddr, __be16 dport, int dif)
{
	struct inet_sock *inet;
	int score;

	if (!net_eq(sock_net(sk), net) ||
	    udp_sk(sk)->udp_port_hash != hnum ||
	    ipv6_only_sock(sk))
		return -1;

	inet = inet_sk(sk);
	score = (sk->sk_family == PF_INET) ? 1 : 0;

	if (inet->inet_rcv_saddr) {
		if (inet->inet_rcv_saddr != daddr)
			return -1;
		score += 2;
	}
	if (inet->inet_daddr) {
		if (inet->inet_daddr != saddr)
			return -1;
		score += 2;
	}
	if (inet->inet_dport) {
		if (inet->inet_dport != sport)
			return -1;
		score += 2;
	}
	if (sk->sk_bound_dev_if) {
		if (sk->sk_bound_dev_if != dif)
			return -1;
		score += 2;
	}

	return score;
}
|
|
|
|
|
ipv4: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain we added in previous patch.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, INADDR_ANY) chain to find socket not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probably
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 13:18:11 +03:00
|
|
|
/*
|
|
|
|
* In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
|
|
|
|
*/
|
|
|
|
#define SCORE2_MAX (1 + 2 + 2 + 2)
|
|
|
|
static inline int compute_score2(struct sock *sk, struct net *net,
|
|
|
|
__be32 saddr, __be16 sport,
|
|
|
|
__be32 daddr, unsigned int hnum, int dif)
|
|
|
|
{
|
|
|
|
int score = -1;
|
|
|
|
|
|
|
|
if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) {
|
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
|
|
|
|
|
|
if (inet->inet_rcv_saddr != daddr)
|
|
|
|
return -1;
|
|
|
|
if (inet->inet_num != hnum)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
score = (sk->sk_family == PF_INET ? 1 : 0);
|
|
|
|
if (inet->inet_daddr) {
|
|
|
|
if (inet->inet_daddr != saddr)
|
|
|
|
return -1;
|
|
|
|
score += 2;
|
|
|
|
}
|
|
|
|
if (inet->inet_dport) {
|
|
|
|
if (inet->inet_dport != sport)
|
|
|
|
return -1;
|
|
|
|
score += 2;
|
|
|
|
}
|
|
|
|
if (sk->sk_bound_dev_if) {
|
|
|
|
if (sk->sk_bound_dev_if != dif)
|
|
|
|
return -1;
|
|
|
|
score += 2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return score;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* called with read_rcu_lock() */
|
|
|
|
static struct sock *udp4_lib_lookup2(struct net *net,
|
|
|
|
__be32 saddr, __be16 sport,
|
|
|
|
__be32 daddr, unsigned int hnum, int dif,
|
|
|
|
struct udp_hslot *hslot2, unsigned int slot2)
|
|
|
|
{
|
|
|
|
struct sock *sk, *result;
|
|
|
|
struct hlist_nulls_node *node;
|
|
|
|
int score, badness;
|
|
|
|
|
|
|
|
begin:
|
|
|
|
result = NULL;
|
|
|
|
badness = -1;
|
|
|
|
udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
|
|
|
|
score = compute_score2(sk, net, saddr, sport,
|
|
|
|
daddr, hnum, dif);
|
|
|
|
if (score > badness) {
|
|
|
|
result = sk;
|
|
|
|
badness = score;
|
|
|
|
if (score == SCORE2_MAX)
|
|
|
|
goto exact_match;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* if the nulls value we got at the end of this lookup is
|
|
|
|
* not the expected one, we must restart lookup.
|
|
|
|
* We probably met an item that was moved to another chain.
|
|
|
|
*/
|
|
|
|
if (get_nulls_value(node) != slot2)
|
|
|
|
goto begin;
|
|
|
|
|
|
|
|
if (result) {
|
|
|
|
exact_match:
|
|
|
|
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
|
|
|
|
result = NULL;
|
|
|
|
else if (unlikely(compute_score2(result, net, saddr, sport,
|
|
|
|
daddr, hnum, dif) < badness)) {
|
|
|
|
sock_put(result);
|
|
|
|
goto begin;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2008-03-07 03:22:02 +03:00
|
|
|
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
|
|
|
|
* harder than this. -DaveM
|
|
|
|
*/
|
|
|
|
static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
|
|
|
|
__be16 sport, __be32 daddr, __be16 dport,
|
2008-10-29 11:41:45 +03:00
|
|
|
int dif, struct udp_table *udptable)
|
2008-03-07 03:22:02 +03:00
|
|
|
{
|
udp: RCU handling for Unicast packets.
Goals are :
1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes should happen in the fast path.
Note: Multicasts and broadcasts still will need to take a lock,
because doing a full lockless lookup in this case is difficult.
2) No expensive operations in the socket bind/unhash phases :
- No expensive synchronize_rcu() calls.
- No added rcu_head in socket structure, increasing memory needs,
but more important, forcing us to use call_rcu() calls,
that have the bad property of making sockets structure cold.
(rcu grace period between socket freeing and its potential reuse
make this socket being cold in CPU cache).
David did a previous patch using call_rcu() and noticed a 20%
impact on TCP connection rates.
Quoting Cristopher Lameter :
"Right. That results in cacheline cooldown. You'd want to recycle
the object as they are cache hot on a per cpu basis. That is screwed
up by the delayed regular rcu processing. We have seen multiple
regressions due to cacheline cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
- Because udp sockets are allocated from dedicated kmem_cache,
use of SLAB_DESTROY_BY_RCU can help here.
Theory of operation :
---------------------
As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.
Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.
In order to avoid loops, a reader must check each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, lookup must start again at the beginning. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we dont want to send same message several times to the same socket.
We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-29 12:11:14 +03:00
|
|
|
struct sock *sk, *result;
|
2008-11-17 06:39:21 +03:00
|
|
|
struct hlist_nulls_node *node;
|
2008-03-07 03:22:02 +03:00
|
|
|
unsigned short hnum = ntohs(dport);
|
ipv4: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain we added in previous patch.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, INADDR_ANY) chain to find socket not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probabbly
be performed.
RCU lookups everywhere, we dont acquire spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 13:18:11 +03:00
|
|
|
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
|
|
|
|
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
|
udp: RCU handling for Unicast packets.
Goals are :
1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes should happen in the fast path.
Note: Multicasts and broadcasts still will need to take a lock,
because doing a full lockless lookup in this case is difficult.
2) No expensive operations in the socket bind/unhash phases :
- No expensive synchronize_rcu() calls.
- No added rcu_head in socket structure, increasing memory needs,
but more important, forcing us to use call_rcu() calls,
that have the bad property of making sockets structure cold.
(rcu grace period between socket freeing and its potential reuse
make this socket being cold in CPU cache).
David did a previous patch using call_rcu() and noticed a 20%
impact on TCP connection rates.
Quoting Cristopher Lameter :
"Right. That results in cacheline cooldown. You'd want to recycle
the object as they are cache hot on a per cpu basis. That is screwed
up by the delayed regular rcu processing. We have seen multiple
regressions due to cacheline cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
- Because udp sockets are allocated from dedicated kmem_cache,
use of SLAB_DESTROY_BY_RCU can help here.
Theory of operation :
---------------------
As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.
Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.
In order to avoid loops, a reader must check each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, lookup must start again at the begining. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we dont want to send same message several times to the same socket.
We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-29 12:11:14 +03:00
|
|
|
int score, badness;
|
2008-10-29 11:41:45 +03:00
|
|
|
|
udp: RCU handling for Unicast packets.
Goals are :
1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes should happen in the fast path.
Note: Multicasts and broadcasts still will need to take a lock,
because doing a full lockless lookup in this case is difficult.
2) No expensive operations in the socket bind/unhash phases :
- No expensive synchronize_rcu() calls.
- No added rcu_head in socket structure, increasing memory needs,
but more important, forcing us to use call_rcu() calls,
that have the bad property of making sockets structure cold.
(rcu grace period between socket freeing and its potential reuse
make this socket being cold in CPU cache).
David did a previous patch using call_rcu() and noticed a 20%
impact on TCP connection rates.
Quoting Cristopher Lameter :
"Right. That results in cacheline cooldown. You'd want to recycle
the object as they are cache hot on a per cpu basis. That is screwed
up by the delayed regular rcu processing. We have seen multiple
regressions due to cacheline cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
- Because udp sockets are allocated from dedicated kmem_cache,
use of SLAB_DESTROY_BY_RCU can help here.
Theory of operation :
---------------------
As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.
Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.
In order to avoid loops, a reader must check each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, lookup must start again at the beginning. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we don't want to send the same message several times to the same socket.
We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-29 12:11:14 +03:00
|
|
|
rcu_read_lock();
|
ipv4: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain we added in previous patch.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, INADDR_ANY) chain to find socket not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probably
be performed.
RCU lookups everywhere, we don't acquire the spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 13:18:11 +03:00
|
|
|
if (hslot->count > 10) {
|
|
|
|
hash2 = udp4_portaddr_hash(net, daddr, hnum);
|
|
|
|
slot2 = hash2 & udptable->mask;
|
|
|
|
hslot2 = &udptable->hash2[slot2];
|
|
|
|
if (hslot->count < hslot2->count)
|
|
|
|
goto begin;
|
|
|
|
|
|
|
|
result = udp4_lib_lookup2(net, saddr, sport,
|
|
|
|
daddr, hnum, dif,
|
|
|
|
hslot2, slot2);
|
|
|
|
if (!result) {
|
2010-04-21 06:06:52 +04:00
|
|
|
hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
|
ipv4: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain we added in previous patch.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, INADDR_ANY) chain to find socket not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probably
be performed.
RCU lookups everywhere, we don't acquire the spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 13:18:11 +03:00
|
|
|
slot2 = hash2 & udptable->mask;
|
|
|
|
hslot2 = &udptable->hash2[slot2];
|
|
|
|
if (hslot->count < hslot2->count)
|
|
|
|
goto begin;
|
|
|
|
|
2010-04-08 08:56:48 +04:00
|
|
|
result = udp4_lib_lookup2(net, saddr, sport,
|
2010-04-21 06:06:52 +04:00
|
|
|
htonl(INADDR_ANY), hnum, dif,
|
ipv4: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain we added in previous patch.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, INADDR_ANY) chain to find socket not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probably
be performed.
RCU lookups everywhere, we don't acquire the spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 13:18:11 +03:00
|
|
|
hslot2, slot2);
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
return result;
|
|
|
|
}
|
udp: RCU handling for Unicast packets.
Goals are :
1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes should happen in the fast path.
Note: Multicasts and broadcasts still will need to take a lock,
because doing a full lockless lookup in this case is difficult.
2) No expensive operations in the socket bind/unhash phases :
- No expensive synchronize_rcu() calls.
- No added rcu_head in socket structure, increasing memory needs,
but more important, forcing us to use call_rcu() calls,
that have the bad property of making sockets structure cold.
(rcu grace period between socket freeing and its potential reuse
make this socket being cold in CPU cache).
David did a previous patch using call_rcu() and noticed a 20%
impact on TCP connection rates.
Quoting Cristopher Lameter :
"Right. That results in cacheline cooldown. You'd want to recycle
the object as they are cache hot on a per cpu basis. That is screwed
up by the delayed regular rcu processing. We have seen multiple
regressions due to cacheline cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
- Because udp sockets are allocated from dedicated kmem_cache,
use of SLAB_DESTROY_BY_RCU can help here.
Theory of operation :
---------------------
As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.
Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.
In order to avoid loops, a reader must check each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, lookup must start again at the beginning. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we don't want to send the same message several times to the same socket.
We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-29 12:11:14 +03:00
|
|
|
begin:
|
|
|
|
result = NULL;
|
|
|
|
badness = -1;
|
2008-11-17 06:39:21 +03:00
|
|
|
sk_nulls_for_each_rcu(sk, node, &hslot->head) {
|
2008-10-29 11:41:45 +03:00
|
|
|
score = compute_score(sk, net, saddr, hnum, sport,
|
|
|
|
daddr, dport, dif);
|
|
|
|
if (score > badness) {
|
|
|
|
result = sk;
|
|
|
|
badness = score;
|
2008-03-07 03:22:02 +03:00
|
|
|
}
|
|
|
|
}
|
2008-11-17 06:39:21 +03:00
|
|
|
/*
|
|
|
|
* if the nulls value we got at the end of this lookup is
|
|
|
|
* not the expected one, we must restart lookup.
|
|
|
|
* We probably met an item that was moved to another chain.
|
|
|
|
*/
|
ipv4: udp: optimize unicast RX path
We first locate the (local port) hash chain head
If few sockets are in this chain, we proceed with previous lookup algo.
If too many sockets are listed, we take a look at the secondary
(port, address) hash chain we added in previous patch.
We choose the shortest chain and proceed with a RCU lookup on the elected chain.
But, if we chose (port, address) chain, and fail to find a socket on given address,
we must try another lookup on (port, INADDR_ANY) chain to find socket not bound
to a particular IP.
-> No extra cost for typical setups, where the first lookup will probably
be performed.
RCU lookups everywhere, we don't acquire the spinlock.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-11-08 13:18:11 +03:00
|
|
|
if (get_nulls_value(node) != slot)
|
2008-11-17 06:39:21 +03:00
|
|
|
goto begin;
|
|
|
|
|
udp: RCU handling for Unicast packets.
Goals are :
1) Optimizing handling of incoming Unicast UDP frames, so that no memory
writes should happen in the fast path.
Note: Multicasts and broadcasts still will need to take a lock,
because doing a full lockless lookup in this case is difficult.
2) No expensive operations in the socket bind/unhash phases :
- No expensive synchronize_rcu() calls.
- No added rcu_head in socket structure, increasing memory needs,
but more important, forcing us to use call_rcu() calls,
that have the bad property of making sockets structure cold.
(rcu grace period between socket freeing and its potential reuse
make this socket being cold in CPU cache).
David did a previous patch using call_rcu() and noticed a 20%
impact on TCP connection rates.
Quoting Cristopher Lameter :
"Right. That results in cacheline cooldown. You'd want to recycle
the object as they are cache hot on a per cpu basis. That is screwed
up by the delayed regular rcu processing. We have seen multiple
regressions due to cacheline cooldown.
The only choice in cacheline hot sensitive areas is to deal with the
complexity that comes with SLAB_DESTROY_BY_RCU or give up on RCU."
- Because udp sockets are allocated from dedicated kmem_cache,
use of SLAB_DESTROY_BY_RCU can help here.
Theory of operation :
---------------------
As the lookup is lockfree (using rcu_read_lock()/rcu_read_unlock()),
special attention must be taken by readers and writers.
Use of SLAB_DESTROY_BY_RCU is tricky too, because a socket can be freed,
reused, inserted in a different chain or in worst case in the same chain
while readers could do lookups in the same time.
In order to avoid loops, a reader must check each socket found in a chain
really belongs to the chain the reader was traversing. If it finds a
mismatch, lookup must start again at the beginning. This *restart* loop
is the reason we had to use rdlock for the multicast case, because
we don't want to send the same message several times to the same socket.
We use RCU only for fast path.
Thus, /proc/net/udp still takes spinlocks.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2008-10-29 12:11:14 +03:00
|
|
|
if (result) {
|
|
|
|
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
|
|
|
|
result = NULL;
|
|
|
|
else if (unlikely(compute_score(result, net, saddr, hnum, sport,
|
|
|
|
daddr, dport, dif) < badness)) {
|
|
|
|
sock_put(result);
|
|
|
|
goto begin;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
2008-03-07 03:22:02 +03:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2008-10-07 23:38:32 +04:00
|
|
|
/*
 * Pick the socket for an incoming skb.
 *
 * If skb_steal_sock() hands back a socket, that one is returned as is.
 * Otherwise the socket is looked up in @udptable from the skb's IP
 * source/destination addresses, the supplied ports, the namespace of the
 * skb's dst device and the incoming interface index.
 */
static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
						 __be16 sport, __be16 dport,
						 struct udp_table *udptable)
{
	const struct iphdr *iph;
	struct sock *stolen;

	stolen = skb_steal_sock(skb);
	if (unlikely(stolen))
		return stolen;

	iph = ip_hdr(skb);
	return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev),
				 iph->saddr, sport,
				 iph->daddr, dport,
				 inet_iif(skb), udptable);
}
|
|
|
|
|
2008-10-01 18:48:10 +04:00
|
|
|
/*
 * Exported lookup entry point: same as __udp4_lib_lookup(), always
 * searching the global udp_table.
 */
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
			     __be32 daddr, __be16 dport, int dif)
{
	struct udp_table *table = &udp_table;

	return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, table);
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
|
|
|
|
|
2008-11-02 07:22:23 +03:00
|
|
|
static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
|
2008-03-07 03:22:02 +03:00
|
|
|
__be16 loc_port, __be32 loc_addr,
|
|
|
|
__be16 rmt_port, __be32 rmt_addr,
|
|
|
|
int dif)
|
|
|
|
{
|
2008-11-17 06:39:21 +03:00
|
|
|
struct hlist_nulls_node *node;
|
2008-03-07 03:22:02 +03:00
|
|
|
struct sock *s = sk;
|
|
|
|
unsigned short hnum = ntohs(loc_port);
|
|
|
|
|
2008-11-17 06:39:21 +03:00
|
|
|
sk_nulls_for_each_from(s, node) {
|
2008-03-07 03:22:02 +03:00
|
|
|
struct inet_sock *inet = inet_sk(s);
|
|
|
|
|
2009-11-23 21:41:23 +03:00
|
|
|
if (!net_eq(sock_net(s), net) ||
|
|
|
|
udp_sk(s)->udp_port_hash != hnum ||
|
|
|
|
(inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
|
|
|
|
(inet->inet_dport != rmt_port && inet->inet_dport) ||
|
|
|
|
(inet->inet_rcv_saddr &&
|
|
|
|
inet->inet_rcv_saddr != loc_addr) ||
|
|
|
|
ipv6_only_sock(s) ||
|
2008-03-07 03:22:02 +03:00
|
|
|
(s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
|
|
|
|
continue;
|
|
|
|
if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
|
|
|
|
continue;
|
|
|
|
goto found;
|
|
|
|
}
|
|
|
|
s = NULL;
|
|
|
|
found:
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This routine is called by the ICMP module when it gets some
|
|
|
|
* sort of error condition. If err < 0 then the socket should
|
|
|
|
* be closed and the error returned to the user. If err > 0
|
|
|
|
* it's just the icmp type << 8 | icmp code.
|
|
|
|
* Header points to the ip header of the error packet. We move
|
|
|
|
* on past this. Then (as it used to claim before adjustment)
|
|
|
|
* header points to the first 8 bytes of the udp header. We need
|
|
|
|
* to find the appropriate port.
|
|
|
|
*/
|
|
|
|
|
2008-10-29 11:41:45 +03:00
|
|
|
void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
|
2008-03-07 03:22:02 +03:00
|
|
|
{
|
|
|
|
struct inet_sock *inet;
|
2009-07-17 04:26:32 +04:00
|
|
|
struct iphdr *iph = (struct iphdr *)skb->data;
|
|
|
|
struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
|
2008-03-07 03:22:02 +03:00
|
|
|
const int type = icmp_hdr(skb)->type;
|
|
|
|
const int code = icmp_hdr(skb)->code;
|
|
|
|
struct sock *sk;
|
|
|
|
int harderr;
|
|
|
|
int err;
|
2008-07-15 10:01:40 +04:00
|
|
|
struct net *net = dev_net(skb->dev);
|
2008-03-07 03:22:02 +03:00
|
|
|
|
2008-07-15 10:01:40 +04:00
|
|
|
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
|
2008-03-07 03:22:02 +03:00
|
|
|
iph->saddr, uh->source, skb->dev->ifindex, udptable);
|
|
|
|
if (sk == NULL) {
|
2008-07-15 10:03:00 +04:00
|
|
|
ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
|
2008-03-07 03:22:02 +03:00
|
|
|
return; /* No socket for error */
|
|
|
|
}
|
|
|
|
|
|
|
|
err = 0;
|
|
|
|
harderr = 0;
|
|
|
|
inet = inet_sk(sk);
|
|
|
|
|
|
|
|
switch (type) {
|
|
|
|
default:
|
|
|
|
case ICMP_TIME_EXCEEDED:
|
|
|
|
err = EHOSTUNREACH;
|
|
|
|
break;
|
|
|
|
case ICMP_SOURCE_QUENCH:
|
|
|
|
goto out;
|
|
|
|
case ICMP_PARAMETERPROB:
|
|
|
|
err = EPROTO;
|
|
|
|
harderr = 1;
|
|
|
|
break;
|
|
|
|
case ICMP_DEST_UNREACH:
|
|
|
|
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
|
|
|
|
if (inet->pmtudisc != IP_PMTUDISC_DONT) {
|
|
|
|
err = EMSGSIZE;
|
|
|
|
harderr = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
err = EHOSTUNREACH;
|
|
|
|
if (code <= NR_ICMP_UNREACH) {
|
|
|
|
harderr = icmp_err_convert[code].fatal;
|
|
|
|
err = icmp_err_convert[code].errno;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* RFC1122: OK. Passes ICMP errors back to application, as per
|
|
|
|
* 4.1.3.3.
|
|
|
|
*/
|
|
|
|
if (!inet->recverr) {
|
|
|
|
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
|
|
|
|
goto out;
|
|
|
|
} else {
|
2009-07-17 04:26:32 +04:00
|
|
|
ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
|
2008-03-07 03:22:02 +03:00
|
|
|
}
|
|
|
|
sk->sk_err = err;
|
|
|
|
sk->sk_error_report(sk);
|
|
|
|
out:
|
|
|
|
sock_put(sk);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ICMP error entry point for sockets hashed in the global udp_table. */
void udp_err(struct sk_buff *skb, u32 info)
{
	struct udp_table *table = &udp_table;

	__udp4_lib_err(skb, info, table);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Throw away all pending data and cancel the corking. Socket is locked.
|
|
|
|
*/
|
2008-06-04 15:49:07 +04:00
|
|
|
void udp_flush_pending_frames(struct sock *sk)
|
2008-03-07 03:22:02 +03:00
|
|
|
{
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
|
|
|
|
if (up->pending) {
|
|
|
|
up->len = 0;
|
|
|
|
up->pending = 0;
|
|
|
|
ip_flush_pending_frames(sk);
|
|
|
|
}
|
|
|
|
}
|
2008-06-04 15:49:07 +04:00
|
|
|
EXPORT_SYMBOL(udp_flush_pending_frames);
|
2008-03-07 03:22:02 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* udp4_hwcsum_outgoing - handle outgoing HW checksumming
|
|
|
|
* @sk: socket we are sending on
|
|
|
|
* @skb: sk_buff containing the filled-in UDP header
|
|
|
|
* (checksum field must be zeroed out)
|
|
|
|
*/
|
|
|
|
static void udp4_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
|
2009-07-17 04:26:32 +04:00
|
|
|
__be32 src, __be32 dst, int len)
|
2008-03-07 03:22:02 +03:00
|
|
|
{
|
|
|
|
unsigned int offset;
|
|
|
|
struct udphdr *uh = udp_hdr(skb);
|
|
|
|
__wsum csum = 0;
|
|
|
|
|
|
|
|
if (skb_queue_len(&sk->sk_write_queue) == 1) {
|
|
|
|
/*
|
|
|
|
* Only one fragment on the socket.
|
|
|
|
*/
|
|
|
|
skb->csum_start = skb_transport_header(skb) - skb->head;
|
|
|
|
skb->csum_offset = offsetof(struct udphdr, check);
|
|
|
|
uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* HW-checksum won't work as there are two or more
|
|
|
|
* fragments on the socket so that all csums of sk_buffs
|
|
|
|
* should be together
|
|
|
|
*/
|
|
|
|
offset = skb_transport_offset(skb);
|
|
|
|
skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
|
|
|
|
|
|
|
|
skb->ip_summed = CHECKSUM_NONE;
|
|
|
|
|
|
|
|
skb_queue_walk(&sk->sk_write_queue, skb) {
|
|
|
|
csum = csum_add(csum, skb->csum);
|
|
|
|
}
|
|
|
|
|
|
|
|
uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum);
|
|
|
|
if (uh->check == 0)
|
|
|
|
uh->check = CSUM_MANGLED_0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Push out all pending data as one UDP datagram. Socket is locked.
|
|
|
|
*/
|
|
|
|
static int udp_push_pending_frames(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
|
|
struct flowi *fl = &inet->cork.fl;
|
|
|
|
struct sk_buff *skb;
|
|
|
|
struct udphdr *uh;
|
|
|
|
int err = 0;
|
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
|
|
|
__wsum csum = 0;
|
|
|
|
|
|
|
|
/* Grab the skbuff where UDP header space exists. */
|
|
|
|
if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a UDP header
|
|
|
|
*/
|
|
|
|
uh = udp_hdr(skb);
|
|
|
|
uh->source = fl->fl_ip_sport;
|
|
|
|
uh->dest = fl->fl_ip_dport;
|
|
|
|
uh->len = htons(up->len);
|
|
|
|
uh->check = 0;
|
|
|
|
|
|
|
|
if (is_udplite) /* UDP-Lite */
|
|
|
|
csum = udplite_csum_outgoing(sk, skb);
|
|
|
|
|
|
|
|
else if (sk->sk_no_check == UDP_CSUM_NOXMIT) { /* UDP csum disabled */
|
|
|
|
|
|
|
|
skb->ip_summed = CHECKSUM_NONE;
|
|
|
|
goto send;
|
|
|
|
|
|
|
|
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
|
|
|
|
|
2009-07-17 04:26:32 +04:00
|
|
|
udp4_hwcsum_outgoing(sk, skb, fl->fl4_src, fl->fl4_dst, up->len);
|
2008-03-07 03:22:02 +03:00
|
|
|
goto send;
|
|
|
|
|
|
|
|
} else /* `normal' UDP */
|
|
|
|
csum = udp_csum_outgoing(sk, skb);
|
|
|
|
|
|
|
|
/* add protocol-dependent pseudo-header */
|
|
|
|
uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, up->len,
|
2009-07-17 04:26:32 +04:00
|
|
|
sk->sk_protocol, csum);
|
2008-03-07 03:22:02 +03:00
|
|
|
if (uh->check == 0)
|
|
|
|
uh->check = CSUM_MANGLED_0;
|
|
|
|
|
|
|
|
send:
|
|
|
|
err = ip_push_pending_frames(sk);
|
ip: Report qdisc packet drops
Christoph Lameter pointed out that packet drops at qdisc level where not
accounted in SNMP counters. Only if application sets IP_RECVERR, drops
are reported to user (-ENOBUFS errors) and SNMP counters updated.
IP_RECVERR is used to enable extended reliable error message passing,
but these are not needed to update system wide SNMP stats.
This patch changes things a bit to allow SNMP counters to be updated,
regardless of IP_RECVERR being set or not on the socket.
Example after an UDP tx flood
# netstat -s
...
IP:
1487048 outgoing packets dropped
...
Udp:
...
SndbufErrors: 1487048
send() syscalls, do however still return an OK status, to not
break applications.
Note : send() manual page explicitly says for -ENOBUFS error :
"The output queue for a network interface was full.
This generally indicates that the interface has stopped sending,
but may be caused by transient congestion.
(Normally, this does not occur in Linux. Packets are just silently
dropped when a device queue overflows.) "
This is not true for IP_RECVERR enabled sockets : a send() syscall
that hit a qdisc drop returns an ENOBUFS error.
Many thanks to Christoph, David, and last but not least, Alexey !
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-09-03 05:05:33 +04:00
|
|
|
if (err) {
|
|
|
|
if (err == -ENOBUFS && !inet->recverr) {
|
|
|
|
UDP_INC_STATS_USER(sock_net(sk),
|
|
|
|
UDP_MIB_SNDBUFERRORS, is_udplite);
|
|
|
|
err = 0;
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
UDP_INC_STATS_USER(sock_net(sk),
|
|
|
|
UDP_MIB_OUTDATAGRAMS, is_udplite);
|
2008-03-07 03:22:02 +03:00
|
|
|
out:
|
|
|
|
up->len = 0;
|
|
|
|
up->pending = 0;
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
|
|
|
|
size_t len)
|
|
|
|
{
|
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
int ulen = len;
|
|
|
|
struct ipcm_cookie ipc;
|
|
|
|
struct rtable *rt = NULL;
|
|
|
|
int free = 0;
|
|
|
|
int connected = 0;
|
|
|
|
__be32 daddr, faddr, saddr;
|
|
|
|
__be16 dport;
|
|
|
|
u8 tos;
|
|
|
|
int err, is_udplite = IS_UDPLITE(sk);
|
|
|
|
int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
|
|
|
|
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
|
|
|
|
|
|
|
|
if (len > 0xFFFF)
|
|
|
|
return -EMSGSIZE;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check the flags.
|
|
|
|
*/
|
|
|
|
|
2009-07-17 04:26:32 +04:00
|
|
|
if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */
|
2008-03-07 03:22:02 +03:00
|
|
|
return -EOPNOTSUPP;
|
|
|
|
|
|
|
|
ipc.opt = NULL;
|
2009-02-12 08:03:39 +03:00
|
|
|
ipc.shtx.flags = 0;
|
2008-03-07 03:22:02 +03:00
|
|
|
|
|
|
|
if (up->pending) {
|
|
|
|
/*
|
|
|
|
* There are pending frames.
|
|
|
|
* The socket lock must be held while it's corked.
|
|
|
|
*/
|
|
|
|
lock_sock(sk);
|
|
|
|
if (likely(up->pending)) {
|
|
|
|
if (unlikely(up->pending != AF_INET)) {
|
|
|
|
release_sock(sk);
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
goto do_append_data;
|
|
|
|
}
|
|
|
|
release_sock(sk);
|
|
|
|
}
|
|
|
|
ulen += sizeof(struct udphdr);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get and verify the address.
|
|
|
|
*/
|
|
|
|
if (msg->msg_name) {
|
2009-07-17 04:26:32 +04:00
|
|
|
struct sockaddr_in * usin = (struct sockaddr_in *)msg->msg_name;
|
2008-03-07 03:22:02 +03:00
|
|
|
if (msg->msg_namelen < sizeof(*usin))
|
|
|
|
return -EINVAL;
|
|
|
|
if (usin->sin_family != AF_INET) {
|
|
|
|
if (usin->sin_family != AF_UNSPEC)
|
|
|
|
return -EAFNOSUPPORT;
|
|
|
|
}
|
|
|
|
|
|
|
|
daddr = usin->sin_addr.s_addr;
|
|
|
|
dport = usin->sin_port;
|
|
|
|
if (dport == 0)
|
|
|
|
return -EINVAL;
|
|
|
|
} else {
|
|
|
|
if (sk->sk_state != TCP_ESTABLISHED)
|
|
|
|
return -EDESTADDRREQ;
|
2009-10-15 10:30:45 +04:00
|
|
|
daddr = inet->inet_daddr;
|
|
|
|
dport = inet->inet_dport;
|
2008-03-07 03:22:02 +03:00
|
|
|
/* Open fast path for connected socket.
|
|
|
|
Route will not be used, if at least one option is set.
|
|
|
|
*/
|
|
|
|
connected = 1;
|
|
|
|
}
|
2009-10-15 10:30:45 +04:00
|
|
|
ipc.addr = inet->inet_saddr;
|
2008-03-07 03:22:02 +03:00
|
|
|
|
|
|
|
ipc.oif = sk->sk_bound_dev_if;
|
2009-02-12 08:03:39 +03:00
|
|
|
err = sock_tx_timestamp(msg, sk, &ipc.shtx);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2008-03-07 03:22:02 +03:00
|
|
|
if (msg->msg_controllen) {
|
2008-03-25 20:26:21 +03:00
|
|
|
err = ip_cmsg_send(sock_net(sk), msg, &ipc);
|
2008-03-07 03:22:02 +03:00
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
if (ipc.opt)
|
|
|
|
free = 1;
|
|
|
|
connected = 0;
|
|
|
|
}
|
|
|
|
if (!ipc.opt)
|
|
|
|
ipc.opt = inet->opt;
|
|
|
|
|
|
|
|
saddr = ipc.addr;
|
|
|
|
ipc.addr = faddr = daddr;
|
|
|
|
|
|
|
|
if (ipc.opt && ipc.opt->srr) {
|
|
|
|
if (!daddr)
|
|
|
|
return -EINVAL;
|
|
|
|
faddr = ipc.opt->faddr;
|
|
|
|
connected = 0;
|
|
|
|
}
|
|
|
|
tos = RT_TOS(inet->tos);
|
|
|
|
if (sock_flag(sk, SOCK_LOCALROUTE) ||
|
|
|
|
(msg->msg_flags & MSG_DONTROUTE) ||
|
|
|
|
(ipc.opt && ipc.opt->is_strictroute)) {
|
|
|
|
tos |= RTO_ONLINK;
|
|
|
|
connected = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ipv4_is_multicast(daddr)) {
|
|
|
|
if (!ipc.oif)
|
|
|
|
ipc.oif = inet->mc_index;
|
|
|
|
if (!saddr)
|
|
|
|
saddr = inet->mc_addr;
|
|
|
|
connected = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (connected)
|
2009-07-17 04:26:32 +04:00
|
|
|
rt = (struct rtable *)sk_dst_check(sk, 0);
|
2008-03-07 03:22:02 +03:00
|
|
|
|
|
|
|
if (rt == NULL) {
|
|
|
|
struct flowi fl = { .oif = ipc.oif,
|
2009-10-02 02:16:49 +04:00
|
|
|
.mark = sk->sk_mark,
|
2008-03-07 03:22:02 +03:00
|
|
|
.nl_u = { .ip4_u =
|
|
|
|
{ .daddr = faddr,
|
|
|
|
.saddr = saddr,
|
|
|
|
.tos = tos } },
|
|
|
|
.proto = sk->sk_protocol,
|
2008-11-20 12:07:24 +03:00
|
|
|
.flags = inet_sk_flowi_flags(sk),
|
2008-03-07 03:22:02 +03:00
|
|
|
.uli_u = { .ports =
|
2009-10-15 10:30:45 +04:00
|
|
|
{ .sport = inet->inet_sport,
|
2008-03-07 03:22:02 +03:00
|
|
|
.dport = dport } } };
|
2008-07-17 07:19:08 +04:00
|
|
|
struct net *net = sock_net(sk);
|
|
|
|
|
2008-03-07 03:22:02 +03:00
|
|
|
security_sk_classify_flow(sk, &fl);
|
2008-07-17 07:19:08 +04:00
|
|
|
err = ip_route_output_flow(net, &rt, &fl, sk, 1);
|
2008-03-07 03:22:02 +03:00
|
|
|
if (err) {
|
|
|
|
if (err == -ENETUNREACH)
|
2008-07-17 07:20:11 +04:00
|
|
|
IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
|
2008-03-07 03:22:02 +03:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
err = -EACCES;
|
|
|
|
if ((rt->rt_flags & RTCF_BROADCAST) &&
|
|
|
|
!sock_flag(sk, SOCK_BROADCAST))
|
|
|
|
goto out;
|
|
|
|
if (connected)
|
|
|
|
sk_dst_set(sk, dst_clone(&rt->u.dst));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (msg->msg_flags&MSG_CONFIRM)
|
|
|
|
goto do_confirm;
|
|
|
|
back_from_confirm:
|
|
|
|
|
|
|
|
saddr = rt->rt_src;
|
|
|
|
if (!ipc.addr)
|
|
|
|
daddr = ipc.addr = rt->rt_dst;
|
|
|
|
|
|
|
|
lock_sock(sk);
|
|
|
|
if (unlikely(up->pending)) {
|
|
|
|
/* The socket is already corked while preparing it. */
|
|
|
|
/* ... which is an evident application bug. --ANK */
|
|
|
|
release_sock(sk);
|
|
|
|
|
|
|
|
LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n");
|
|
|
|
err = -EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Now cork the socket to pend data.
|
|
|
|
*/
|
|
|
|
inet->cork.fl.fl4_dst = daddr;
|
|
|
|
inet->cork.fl.fl_ip_dport = dport;
|
|
|
|
inet->cork.fl.fl4_src = saddr;
|
2009-10-15 10:30:45 +04:00
|
|
|
inet->cork.fl.fl_ip_sport = inet->inet_sport;
|
2008-03-07 03:22:02 +03:00
|
|
|
up->pending = AF_INET;
|
|
|
|
|
|
|
|
do_append_data:
|
|
|
|
up->len += ulen;
|
|
|
|
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
|
|
|
|
err = ip_append_data(sk, getfrag, msg->msg_iov, ulen,
|
2008-11-25 02:52:46 +03:00
|
|
|
sizeof(struct udphdr), &ipc, &rt,
|
2008-03-07 03:22:02 +03:00
|
|
|
corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
|
|
|
|
if (err)
|
|
|
|
udp_flush_pending_frames(sk);
|
|
|
|
else if (!corkreq)
|
|
|
|
err = udp_push_pending_frames(sk);
|
|
|
|
else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
|
|
|
|
up->pending = 0;
|
|
|
|
release_sock(sk);
|
|
|
|
|
|
|
|
out:
|
|
|
|
ip_rt_put(rt);
|
|
|
|
if (free)
|
|
|
|
kfree(ipc.opt);
|
|
|
|
if (!err)
|
|
|
|
return len;
|
|
|
|
/*
|
|
|
|
* ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting
|
|
|
|
* ENOBUFS might not be good (it's not tunable per se), but otherwise
|
|
|
|
* we don't have a good statistic (IpOutDiscards but it can be too many
|
|
|
|
* things). We could add another new stat but at least for now that
|
|
|
|
* seems like overkill.
|
|
|
|
*/
|
|
|
|
if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
|
2008-07-06 08:18:07 +04:00
|
|
|
UDP_INC_STATS_USER(sock_net(sk),
|
|
|
|
UDP_MIB_SNDBUFERRORS, is_udplite);
|
2008-03-07 03:22:02 +03:00
|
|
|
}
|
|
|
|
return err;
|
|
|
|
|
|
|
|
do_confirm:
|
|
|
|
dst_confirm(&rt->u.dst);
|
|
|
|
if (!(msg->msg_flags&MSG_PROBE) || len)
|
|
|
|
goto back_from_confirm;
|
|
|
|
err = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
2009-07-17 04:26:32 +04:00
|
|
|
EXPORT_SYMBOL(udp_sendmsg);
|
2008-03-07 03:22:02 +03:00
|
|
|
|
|
|
|
int udp_sendpage(struct sock *sk, struct page *page, int offset,
|
|
|
|
size_t size, int flags)
|
|
|
|
{
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!up->pending) {
|
|
|
|
struct msghdr msg = { .msg_flags = flags|MSG_MORE };
|
|
|
|
|
|
|
|
/* Call udp_sendmsg to specify destination address which
|
|
|
|
* sendpage interface can't pass.
|
|
|
|
* This will succeed only when the socket is connected.
|
|
|
|
*/
|
|
|
|
ret = udp_sendmsg(NULL, sk, &msg, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
lock_sock(sk);
|
|
|
|
|
|
|
|
if (unlikely(!up->pending)) {
|
|
|
|
release_sock(sk);
|
|
|
|
|
|
|
|
LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = ip_append_page(sk, page, offset, size, flags);
|
|
|
|
if (ret == -EOPNOTSUPP) {
|
|
|
|
release_sock(sk);
|
|
|
|
return sock_no_sendpage(sk->sk_socket, page, offset,
|
|
|
|
size, flags);
|
|
|
|
}
|
|
|
|
if (ret < 0) {
|
|
|
|
udp_flush_pending_frames(sk);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
up->len += size;
|
|
|
|
if (!(up->corkflag || (flags&MSG_MORE)))
|
|
|
|
ret = udp_push_pending_frames(sk);
|
|
|
|
if (!ret)
|
|
|
|
ret = size;
|
|
|
|
out:
|
|
|
|
release_sock(sk);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2009-10-09 08:43:40 +04:00
|
|
|
|
|
|
|
/**
|
|
|
|
* first_packet_length - return length of first packet in receive queue
|
|
|
|
* @sk: socket
|
|
|
|
*
|
|
|
|
* Drops all bad checksum frames, until a valid one is found.
|
|
|
|
* Returns the length of found skb, or 0 if none is found.
|
|
|
|
*/
|
|
|
|
static unsigned int first_packet_length(struct sock *sk)
|
|
|
|
{
|
|
|
|
struct sk_buff_head list_kill, *rcvq = &sk->sk_receive_queue;
|
|
|
|
struct sk_buff *skb;
|
|
|
|
unsigned int res;
|
|
|
|
|
|
|
|
__skb_queue_head_init(&list_kill);
|
|
|
|
|
|
|
|
spin_lock_bh(&rcvq->lock);
|
|
|
|
while ((skb = skb_peek(rcvq)) != NULL &&
|
|
|
|
udp_lib_checksum_complete(skb)) {
|
|
|
|
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
|
|
|
|
IS_UDPLITE(sk));
|
2009-10-15 04:12:40 +04:00
|
|
|
atomic_inc(&sk->sk_drops);
|
2009-10-09 08:43:40 +04:00
|
|
|
__skb_unlink(skb, rcvq);
|
|
|
|
__skb_queue_tail(&list_kill, skb);
|
|
|
|
}
|
|
|
|
res = skb ? skb->len : 0;
|
|
|
|
spin_unlock_bh(&rcvq->lock);
|
|
|
|
|
|
|
|
if (!skb_queue_empty(&list_kill)) {
|
2010-04-29 01:35:48 +04:00
|
|
|
lock_sock_bh(sk);
|
2009-10-09 08:43:40 +04:00
|
|
|
__skb_queue_purge(&list_kill);
|
|
|
|
sk_mem_reclaim_partial(sk);
|
2010-04-29 01:35:48 +04:00
|
|
|
unlock_sock_bh(sk);
|
2009-10-09 08:43:40 +04:00
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* IOCTL requests applicable to the UDP protocol
|
|
|
|
*/
|
2007-02-09 17:24:47 +03:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
|
|
|
|
{
|
2007-03-09 07:41:55 +03:00
|
|
|
switch (cmd) {
|
|
|
|
case SIOCOUTQ:
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
2009-06-18 06:05:41 +04:00
|
|
|
int amount = sk_wmem_alloc_get(sk);
|
|
|
|
|
2007-03-09 07:41:55 +03:00
|
|
|
return put_user(amount, (int __user *)arg);
|
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2007-03-09 07:41:55 +03:00
|
|
|
case SIOCINQ:
|
|
|
|
{
|
2009-10-09 08:43:40 +04:00
|
|
|
unsigned int amount = first_packet_length(sk);
|
2007-03-09 07:41:55 +03:00
|
|
|
|
2009-10-09 08:43:40 +04:00
|
|
|
if (amount)
|
2007-03-09 07:41:55 +03:00
|
|
|
/*
|
|
|
|
* We will only return the amount
|
|
|
|
* of this packet since that is all
|
|
|
|
* that will be read.
|
|
|
|
*/
|
2009-10-09 08:43:40 +04:00
|
|
|
amount -= sizeof(struct udphdr);
|
|
|
|
|
2007-03-09 07:41:55 +03:00
|
|
|
return put_user(amount, (int __user *)arg);
|
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2007-03-09 07:41:55 +03:00
|
|
|
default:
|
|
|
|
return -ENOIOCTLCMD;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2007-03-09 07:41:55 +03:00
|
|
|
|
|
|
|
return 0;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2009-07-17 04:26:32 +04:00
|
|
|
EXPORT_SYMBOL(udp_ioctl);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-03-07 03:22:02 +03:00
|
|
|
/*
|
|
|
|
* This should be easy, if there is something there we
|
|
|
|
* return it, otherwise we block.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
|
|
|
|
size_t len, int noblock, int flags, int *addr_len)
|
|
|
|
{
|
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
|
|
struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
|
|
|
|
struct sk_buff *skb;
|
2010-02-10 23:26:19 +03:00
|
|
|
unsigned int ulen;
|
2008-03-07 03:22:02 +03:00
|
|
|
int peeked;
|
|
|
|
int err;
|
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Check any passed addresses
|
|
|
|
*/
|
|
|
|
if (addr_len)
|
2009-07-17 04:26:32 +04:00
|
|
|
*addr_len = sizeof(*sin);
|
2008-03-07 03:22:02 +03:00
|
|
|
|
|
|
|
if (flags & MSG_ERRQUEUE)
|
|
|
|
return ip_recv_error(sk, msg, len);
|
|
|
|
|
|
|
|
try_again:
|
|
|
|
skb = __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
|
|
|
|
&peeked, &err);
|
|
|
|
if (!skb)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ulen = skb->len - sizeof(struct udphdr);
|
2010-02-10 23:26:19 +03:00
|
|
|
if (len > ulen)
|
|
|
|
len = ulen;
|
|
|
|
else if (len < ulen)
|
2008-03-07 03:22:02 +03:00
|
|
|
msg->msg_flags |= MSG_TRUNC;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If checksum is needed at all, try to do it while copying the
|
|
|
|
* data. If the data is truncated, or if we only want a partial
|
|
|
|
* coverage checksum (UDP-Lite), do it before the copy.
|
|
|
|
*/
|
|
|
|
|
2010-02-10 23:26:19 +03:00
|
|
|
if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
|
2008-03-07 03:22:02 +03:00
|
|
|
if (udp_lib_checksum_complete(skb))
|
|
|
|
goto csum_copy_err;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (skb_csum_unnecessary(skb))
|
|
|
|
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
|
2010-02-10 23:26:19 +03:00
|
|
|
msg->msg_iov, len);
|
2008-03-07 03:22:02 +03:00
|
|
|
else {
|
2009-07-17 04:26:32 +04:00
|
|
|
err = skb_copy_and_csum_datagram_iovec(skb,
|
|
|
|
sizeof(struct udphdr),
|
|
|
|
msg->msg_iov);
|
2008-03-07 03:22:02 +03:00
|
|
|
|
|
|
|
if (err == -EINVAL)
|
|
|
|
goto csum_copy_err;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (err)
|
|
|
|
goto out_free;
|
|
|
|
|
|
|
|
if (!peeked)
|
2008-07-06 08:18:07 +04:00
|
|
|
UDP_INC_STATS_USER(sock_net(sk),
|
|
|
|
UDP_MIB_INDATAGRAMS, is_udplite);
|
2008-03-07 03:22:02 +03:00
|
|
|
|
net: Generalize socket rx gap / receive queue overflow cmsg
Create a new socket level option to report number of queue overflows
Recently I augmented the AF_PACKET protocol to report the number of frames lost
on the socket receive queue between any two enqueued frames. This value was
exported via a SOL_PACKET level cmsg. AFter I completed that work it was
requested that this feature be generalized so that any datagram oriented socket
could make use of this option. As such I've created this patch, It creates a
new SOL_SOCKET level option called SO_RXQ_OVFL, which when enabled exports a
SOL_SOCKET level cmsg that reports the nubmer of times the sk_receive_queue
overflowed between any two given frames. It also augments the AF_PACKET
protocol to take advantage of this new feature (as it previously did not touch
sk->sk_drops, which this patch uses to record the overflow count). Tested
successfully by me.
Notes:
1) Unlike my previous patch, this patch simply records the sk_drops value, which
is not a number of drops between packets, but rather a total number of drops.
Deltas must be computed in user space.
2) While this patch currently works with datagram oriented protocols, it will
also be accepted by non-datagram oriented protocols. I'm not sure if thats
agreeable to everyone, but my argument in favor of doing so is that, for those
protocols which aren't applicable to this option, sk_drops will always be zero,
and reporting no drops on a receive queue that isn't used for those
non-participating protocols seems reasonable to me. This also saves us having
to code in a per-protocol opt in mechanism.
3) This applies cleanly to net-next assuming that commit
977750076d98c7ff6cbda51858bb5a5894a9d9ab (my af packet cmsg patch) is reverted
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2009-10-13 00:26:31 +04:00
|
|
|
sock_recv_ts_and_drops(msg, sk, skb);
|
2008-03-07 03:22:02 +03:00
|
|
|
|
|
|
|
/* Copy the address. */
|
2009-07-17 04:26:32 +04:00
|
|
|
if (sin) {
|
2008-03-07 03:22:02 +03:00
|
|
|
sin->sin_family = AF_INET;
|
|
|
|
sin->sin_port = udp_hdr(skb)->source;
|
|
|
|
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
|
|
|
|
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
|
|
|
|
}
|
|
|
|
if (inet->cmsg_flags)
|
|
|
|
ip_cmsg_recv(msg, skb);
|
|
|
|
|
2010-02-10 23:26:19 +03:00
|
|
|
err = len;
|
2008-03-07 03:22:02 +03:00
|
|
|
if (flags & MSG_TRUNC)
|
|
|
|
err = ulen;
|
|
|
|
|
|
|
|
out_free:
|
2009-10-30 08:03:53 +03:00
|
|
|
skb_free_datagram_locked(sk, skb);
|
2008-03-07 03:22:02 +03:00
|
|
|
out:
|
|
|
|
return err;
|
|
|
|
|
|
|
|
csum_copy_err:
|
2010-04-29 01:35:48 +04:00
|
|
|
lock_sock_bh(sk);
|
2008-03-07 03:22:02 +03:00
|
|
|
if (!skb_kill_datagram(sk, skb, flags))
|
2008-07-06 08:18:07 +04:00
|
|
|
UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
|
2010-04-29 01:35:48 +04:00
|
|
|
unlock_sock_bh(sk);
|
2008-03-07 03:22:02 +03:00
|
|
|
|
|
|
|
if (noblock)
|
|
|
|
return -EAGAIN;
|
|
|
|
goto try_again;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
int udp_disconnect(struct sock *sk, int flags)
|
|
|
|
{
|
|
|
|
struct inet_sock *inet = inet_sk(sk);
|
|
|
|
/*
|
|
|
|
* 1003.1g - break association.
|
|
|
|
*/
|
2007-02-09 17:24:47 +03:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
sk->sk_state = TCP_CLOSE;
|
2009-10-15 10:30:45 +04:00
|
|
|
inet->inet_daddr = 0;
|
|
|
|
inet->inet_dport = 0;
|
2010-04-28 02:05:31 +04:00
|
|
|
sock_rps_save_rxhash(sk, 0);
|
2005-04-17 02:20:36 +04:00
|
|
|
sk->sk_bound_dev_if = 0;
|
|
|
|
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
|
|
|
|
inet_reset_saddr(sk);
|
|
|
|
|
|
|
|
if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
|
|
|
|
sk->sk_prot->unhash(sk);
|
2009-10-15 10:30:45 +04:00
|
|
|
inet->inet_sport = 0;
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
sk_dst_reset(sk);
|
|
|
|
return 0;
|
|
|
|
}
|
2009-07-17 04:26:32 +04:00
|
|
|
EXPORT_SYMBOL(udp_disconnect);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-10-29 11:41:45 +03:00
|
|
|
void udp_lib_unhash(struct sock *sk)
|
|
|
|
{
|
2008-11-26 00:55:15 +03:00
|
|
|
if (sk_hashed(sk)) {
|
|
|
|
struct udp_table *udptable = sk->sk_prot->h.udp_table;
|
2009-11-08 13:17:58 +03:00
|
|
|
struct udp_hslot *hslot, *hslot2;
|
|
|
|
|
|
|
|
hslot = udp_hashslot(udptable, sock_net(sk),
|
|
|
|
udp_sk(sk)->udp_port_hash);
|
|
|
|
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
|
2008-10-29 11:41:45 +03:00
|
|
|
|
2008-11-26 00:55:15 +03:00
|
|
|
spin_lock_bh(&hslot->lock);
|
|
|
|
if (sk_nulls_del_node_init_rcu(sk)) {
|
2009-11-08 13:17:05 +03:00
|
|
|
hslot->count--;
|
2009-10-15 10:30:45 +04:00
|
|
|
inet_sk(sk)->inet_num = 0;
|
2008-11-26 00:55:15 +03:00
|
|
|
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
|
2009-11-08 13:17:58 +03:00
|
|
|
|
|
|
|
spin_lock(&hslot2->lock);
|
|
|
|
hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
|
|
|
|
hslot2->count--;
|
|
|
|
spin_unlock(&hslot2->lock);
|
2008-11-26 00:55:15 +03:00
|
|
|
}
|
|
|
|
spin_unlock_bh(&hslot->lock);
|
2008-10-29 11:41:45 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(udp_lib_unhash);
|
|
|
|
|
2008-09-15 22:48:46 +04:00
|
|
|
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
|
|
|
|
{
|
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The convolution of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue tail has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to interrupting CPU s good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benfit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is an request/response
test similar in structure to netperf RR test ith 100 threads on
each host, but does more work in userspace that netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-17 03:01:27 +04:00
|
|
|
int rc;
|
2009-10-15 07:40:11 +04:00
|
|
|
|
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The convolution of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue tail has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to interrupting CPU s good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benfit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is an request/response
test similar in structure to netperf RR test ith 100 threads on
each host, but does more work in userspace that netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-17 03:01:27 +04:00
|
|
|
if (inet_sk(sk)->inet_daddr)
|
2010-04-28 02:05:31 +04:00
|
|
|
sock_rps_save_rxhash(sk, skb->rxhash);
|
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The convolution of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue tail has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to interrupting CPU s good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benfit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is an request/response
test similar in structure to netperf RR test ith 100 threads on
each host, but does more work in userspace that netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-17 03:01:27 +04:00
|
|
|
|
2010-04-29 02:31:51 +04:00
|
|
|
rc = ip_queue_rcv_skb(sk, skb);
|
2009-10-15 07:40:11 +04:00
|
|
|
if (rc < 0) {
|
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
2008-09-15 22:48:46 +04:00
|
|
|
|
|
|
|
/* Note that an ENOMEM error is charged twice */
|
2009-10-15 07:40:11 +04:00
|
|
|
if (rc == -ENOMEM)
|
2008-09-15 22:48:46 +04:00
|
|
|
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
|
|
|
|
is_udplite);
|
2009-10-15 07:40:11 +04:00
|
|
|
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
|
|
|
|
kfree_skb(skb);
|
|
|
|
return -1;
|
2008-09-15 22:48:46 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2008-03-07 03:22:02 +03:00
|
|
|
/* returns:
|
|
|
|
* -1: error
|
|
|
|
* 0: success
|
|
|
|
* >0: "udp encap" protocol resubmission
|
|
|
|
*
|
|
|
|
* Note that in the success and error cases, the skb is assumed to
|
|
|
|
* have either been requeued or freed.
|
|
|
|
*/
|
2009-07-17 04:26:32 +04:00
|
|
|
int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
|
2008-03-07 03:22:02 +03:00
|
|
|
{
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
int rc;
|
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Charge it to the socket, dropping if the queue is full.
|
|
|
|
*/
|
|
|
|
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
|
|
|
|
goto drop;
|
|
|
|
nf_reset(skb);
|
|
|
|
|
|
|
|
if (up->encap_type) {
|
|
|
|
/*
|
|
|
|
* This is an encapsulation socket so pass the skb to
|
|
|
|
* the socket's udp_encap_rcv() hook. Otherwise, just
|
|
|
|
* fall through and pass this up the UDP socket.
|
|
|
|
* up->encap_rcv() returns the following value:
|
|
|
|
* =0 if skb was successfully passed to the encap
|
|
|
|
* handler or was discarded by it.
|
|
|
|
* >0 if skb should be passed on to UDP.
|
|
|
|
* <0 if skb should be resubmitted as proto -N
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* if we're overly short, let UDP handle it */
|
|
|
|
if (skb->len > sizeof(struct udphdr) &&
|
|
|
|
up->encap_rcv != NULL) {
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
ret = (*up->encap_rcv)(sk, skb);
|
|
|
|
if (ret <= 0) {
|
2008-07-06 08:18:48 +04:00
|
|
|
UDP_INC_STATS_BH(sock_net(sk),
|
|
|
|
UDP_MIB_INDATAGRAMS,
|
2008-03-07 03:22:02 +03:00
|
|
|
is_udplite);
|
|
|
|
return -ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* FALLTHROUGH -- it's a UDP Packet */
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* UDP-Lite specific tests, ignored on UDP sockets
|
|
|
|
*/
|
|
|
|
if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
|
|
|
|
|
|
|
|
/*
|
|
|
|
* MIB statistics other than incrementing the error count are
|
|
|
|
* disabled for the following two types of errors: these depend
|
|
|
|
* on the application settings, not on the functioning of the
|
|
|
|
* protocol stack as such.
|
|
|
|
*
|
|
|
|
* RFC 3828 here recommends (sec 3.3): "There should also be a
|
|
|
|
* way ... to ... at least let the receiving application block
|
|
|
|
* delivery of packets with coverage values less than a value
|
|
|
|
* provided by the application."
|
|
|
|
*/
|
|
|
|
if (up->pcrlen == 0) { /* full coverage was set */
|
|
|
|
LIMIT_NETDEBUG(KERN_WARNING "UDPLITE: partial coverage "
|
|
|
|
"%d while full coverage %d requested\n",
|
|
|
|
UDP_SKB_CB(skb)->cscov, skb->len);
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
/* The next case involves violating the min. coverage requested
|
|
|
|
* by the receiver. This is subtle: if receiver wants x and x is
|
|
|
|
* greater than the buffersize/MTU then receiver will complain
|
|
|
|
* that it wants x while sender emits packets of smaller size y.
|
|
|
|
* Therefore the above ...()->partial_cov statement is essential.
|
|
|
|
*/
|
|
|
|
if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
|
|
|
|
LIMIT_NETDEBUG(KERN_WARNING
|
|
|
|
"UDPLITE: coverage %d too small, need min %d\n",
|
|
|
|
UDP_SKB_CB(skb)->cscov, up->pcrlen);
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (sk->sk_filter) {
|
|
|
|
if (udp_lib_checksum_complete(skb))
|
|
|
|
goto drop;
|
|
|
|
}
|
|
|
|
|
2010-04-28 02:13:20 +04:00
|
|
|
|
|
|
|
if (sk_rcvqueues_full(sk, skb))
|
|
|
|
goto drop;
|
|
|
|
|
2008-09-15 22:48:46 +04:00
|
|
|
rc = 0;
|
2008-03-07 03:22:02 +03:00
|
|
|
|
2008-09-15 22:48:46 +04:00
|
|
|
bh_lock_sock(sk);
|
|
|
|
if (!sock_owned_by_user(sk))
|
|
|
|
rc = __udp_queue_rcv_skb(sk, skb);
|
2010-03-04 21:01:47 +03:00
|
|
|
else if (sk_add_backlog(sk, skb)) {
|
2010-03-04 21:01:42 +03:00
|
|
|
bh_unlock_sock(sk);
|
|
|
|
goto drop;
|
|
|
|
}
|
2008-09-15 22:48:46 +04:00
|
|
|
bh_unlock_sock(sk);
|
|
|
|
|
|
|
|
return rc;
|
2008-03-07 03:22:02 +03:00
|
|
|
|
|
|
|
drop:
|
2008-07-06 08:18:48 +04:00
|
|
|
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
|
2009-10-15 04:12:40 +04:00
|
|
|
atomic_inc(&sk->sk_drops);
|
2008-03-07 03:22:02 +03:00
|
|
|
kfree_skb(skb);
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
2009-11-08 13:18:44 +03:00
|
|
|
|
|
|
|
static void flush_stack(struct sock **stack, unsigned int count,
|
|
|
|
struct sk_buff *skb, unsigned int final)
|
|
|
|
{
|
|
|
|
unsigned int i;
|
|
|
|
struct sk_buff *skb1 = NULL;
|
2009-11-08 13:20:19 +03:00
|
|
|
struct sock *sk;
|
2009-11-08 13:18:44 +03:00
|
|
|
|
|
|
|
for (i = 0; i < count; i++) {
|
2009-11-08 13:20:19 +03:00
|
|
|
sk = stack[i];
|
2009-11-08 13:18:44 +03:00
|
|
|
if (likely(skb1 == NULL))
|
|
|
|
skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
|
|
|
|
|
2009-11-08 13:20:19 +03:00
|
|
|
if (!skb1) {
|
|
|
|
atomic_inc(&sk->sk_drops);
|
|
|
|
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
|
|
|
|
IS_UDPLITE(sk));
|
|
|
|
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
|
|
|
|
IS_UDPLITE(sk));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
|
2009-11-08 13:18:44 +03:00
|
|
|
skb1 = NULL;
|
|
|
|
}
|
|
|
|
if (unlikely(skb1))
|
|
|
|
kfree_skb(skb1);
|
|
|
|
}
|
|
|
|
|
2008-03-07 03:22:02 +03:00
|
|
|
/*
|
|
|
|
* Multicasts and broadcasts go to each listener.
|
|
|
|
*
|
2009-11-08 13:18:44 +03:00
|
|
|
* Note: called only from the BH handler context.
|
2008-03-07 03:22:02 +03:00
|
|
|
*/
|
2008-06-17 04:12:11 +04:00
|
|
|
static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
|
2008-03-07 03:22:02 +03:00
|
|
|
struct udphdr *uh,
|
|
|
|
__be32 saddr, __be32 daddr,
|
2008-10-29 11:41:45 +03:00
|
|
|
struct udp_table *udptable)
|
2008-03-07 03:22:02 +03:00
|
|
|
{
|
2009-11-08 13:18:44 +03:00
|
|
|
struct sock *sk, *stack[256 / sizeof(struct sock *)];
|
2009-10-07 04:37:59 +04:00
|
|
|
struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
|
2008-03-07 03:22:02 +03:00
|
|
|
int dif;
|
2009-11-08 13:18:44 +03:00
|
|
|
unsigned int i, count = 0;
|
2008-03-07 03:22:02 +03:00
|
|
|
|
2008-10-29 11:41:45 +03:00
|
|
|
spin_lock(&hslot->lock);
|
2008-11-17 06:39:21 +03:00
|
|
|
sk = sk_nulls_head(&hslot->head);
|
2008-03-07 03:22:02 +03:00
|
|
|
dif = skb->dev->ifindex;
|
2008-11-02 07:22:23 +03:00
|
|
|
sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
|
2009-11-08 13:18:44 +03:00
|
|
|
while (sk) {
|
|
|
|
stack[count++] = sk;
|
|
|
|
sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
|
|
|
|
daddr, uh->source, saddr, dif);
|
|
|
|
if (unlikely(count == ARRAY_SIZE(stack))) {
|
|
|
|
if (!sk)
|
|
|
|
break;
|
|
|
|
flush_stack(stack, count, skb, ~0);
|
|
|
|
count = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* before releasing chain lock, we must take a reference on sockets
|
|
|
|
*/
|
|
|
|
for (i = 0; i < count; i++)
|
|
|
|
sock_hold(stack[i]);
|
|
|
|
|
2008-10-29 11:41:45 +03:00
|
|
|
spin_unlock(&hslot->lock);
|
2009-11-08 13:18:44 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* do the slow work with no lock held
|
|
|
|
*/
|
|
|
|
if (count) {
|
|
|
|
flush_stack(stack, count, skb, count - 1);
|
|
|
|
|
|
|
|
for (i = 0; i < count; i++)
|
|
|
|
sock_put(stack[i]);
|
|
|
|
} else {
|
|
|
|
kfree_skb(skb);
|
|
|
|
}
|
2008-03-07 03:22:02 +03:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Initialize UDP checksum. If exited with zero value (success),
|
|
|
|
* CHECKSUM_UNNECESSARY means, that no more checks are required.
|
|
|
|
* Otherwise, csum completion requires chacksumming packet body,
|
|
|
|
* including udp header and folding it to skb->csum.
|
|
|
|
*/
|
|
|
|
static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
|
|
|
|
int proto)
|
|
|
|
{
|
|
|
|
const struct iphdr *iph;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
UDP_SKB_CB(skb)->partial_cov = 0;
|
|
|
|
UDP_SKB_CB(skb)->cscov = skb->len;
|
|
|
|
|
|
|
|
if (proto == IPPROTO_UDPLITE) {
|
|
|
|
err = udplite_checksum_init(skb, uh);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
iph = ip_hdr(skb);
|
|
|
|
if (uh->check == 0) {
|
|
|
|
skb->ip_summed = CHECKSUM_UNNECESSARY;
|
|
|
|
} else if (skb->ip_summed == CHECKSUM_COMPLETE) {
|
2009-07-17 04:26:32 +04:00
|
|
|
if (!csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
|
2008-03-07 03:22:02 +03:00
|
|
|
proto, skb->csum))
|
|
|
|
skb->ip_summed = CHECKSUM_UNNECESSARY;
|
|
|
|
}
|
|
|
|
if (!skb_csum_unnecessary(skb))
|
|
|
|
skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
|
|
|
|
skb->len, proto, 0);
|
|
|
|
/* Probably, we should checksum udp header (it should be in cache
|
|
|
|
* in any case) and data in tiny packets (< rx copybreak).
|
|
|
|
*/
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 *	All we need to do is get the socket, and then do a checksum.
 *
 *	Common IPv4 receive routine for UDP and UDP-Lite. @udptable is the
 *	table that sockets are looked up in and @proto is IPPROTO_UDP or
 *	IPPROTO_UDPLITE. Every path visible in this function either frees
 *	@skb or hands it on to another routine.
 *
 *	Returns 0, except that a positive result from udp_queue_rcv_skb()
 *	is returned negated, and multicast/broadcast packets return whatever
 *	__udp4_lib_mcast_deliver() returns.
 */

int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
		   int proto)
{
	struct sock *sk;
	struct udphdr *uh;
	unsigned short ulen;
	struct rtable *rt = skb_rtable(skb);
	__be32 saddr, daddr;
	struct net *net = dev_net(skb->dev);

	/*
	 *  Validate the packet.
	 */
	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
		goto drop;		/* No space for header. */

	uh   = udp_hdr(skb);
	ulen = ntohs(uh->len);
	/* Addresses are cached here for delivery and for the debug messages. */
	saddr = ip_hdr(skb)->saddr;
	daddr = ip_hdr(skb)->daddr;

	/* The length in the UDP header may not exceed what was received. */
	if (ulen > skb->len)
		goto short_packet;

	if (proto == IPPROTO_UDP) {
		/* UDP validates ulen. */
		if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
			goto short_packet;
		/* Re-read the header pointer after the trim. */
		uh = udp_hdr(skb);
	}

	if (udp4_csum_init(skb, uh, proto))
		goto csum_error;

	/* Broadcast and multicast routes take the multi-socket delivery path. */
	if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
		return __udp4_lib_mcast_deliver(net, skb, uh,
				saddr, daddr, udptable);

	sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);

	if (sk != NULL) {
		int ret = udp_queue_rcv_skb(sk, skb);
		/* Release the socket reference now that the skb is queued. */
		sock_put(sk);

		/* a return value > 0 means to resubmit the input, but
		 * it wants the return to be -protocol, or 0
		 */
		if (ret > 0)
			return -ret;
		return 0;
	}

	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto drop;
	nf_reset(skb);

	/* No socket. Drop packet silently, if checksum is wrong */
	if (udp_lib_checksum_complete(skb))
		goto csum_error;

	/* Valid packet but no listener: count it and send port unreachable. */
	UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

	/*
	 * Hmm.  We got an UDP packet to a port to which we
	 * don't wanna listen.  Ignore it.
	 */
	kfree_skb(skb);
	return 0;

short_packet:
	/* Log the bad length, then fall into the common drop accounting. */
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
		       &saddr,
		       ntohs(uh->source),
		       ulen,
		       skb->len,
		       &daddr,
		       ntohs(uh->dest));
	goto drop;

csum_error:
	/*
	 * RFC1122: OK.  Discards the bad packet silently (as far as
	 * the network is concerned, anyway) as per 4.1.3.4 (MUST).
	 */
	LIMIT_NETDEBUG(KERN_DEBUG "UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
		       proto == IPPROTO_UDPLITE ? "-Lite" : "",
		       &saddr,
		       ntohs(uh->source),
		       &daddr,
		       ntohs(uh->dest),
		       ulen);
drop:
	/* Common error exit: bump the INERRORS counter and free the skb. */
	UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
	kfree_skb(skb);
	return 0;
}
|
|
|
|
|
|
|
|
/*
 * Thin wrapper: pass @skb to __udp4_lib_rcv() using the global udp_table
 * and IPPROTO_UDP, and return its result unchanged.
 */
int udp_rcv(struct sk_buff *skb)
{
	return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
}
|
|
|
|
|
2008-06-15 04:04:49 +04:00
|
|
|
/*
 * Flush any frames still pending on @sk, with the flush bracketed by
 * lock_sock_bh()/unlock_sock_bh(). Installed as udp_prot.destroy.
 */
void udp_destroy_sock(struct sock *sk)
{
	lock_sock_bh(sk);
	udp_flush_pending_frames(sk);
	unlock_sock_bh(sk);
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/*
|
|
|
|
* Socket option code for UDP
|
|
|
|
*/
|
2006-11-27 20:29:59 +03:00
|
|
|
int udp_lib_setsockopt(struct sock *sk, int level, int optname,
|
2009-10-01 03:12:20 +04:00
|
|
|
char __user *optval, unsigned int optlen,
|
2006-11-27 20:29:59 +03:00
|
|
|
int (*push_pending_frames)(struct sock *))
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
struct udp_sock *up = udp_sk(sk);
|
|
|
|
int val;
|
|
|
|
int err = 0;
|
2007-12-03 14:34:16 +03:00
|
|
|
int is_udplite = IS_UDPLITE(sk);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2009-07-17 04:26:32 +04:00
|
|
|
if (optlen < sizeof(int))
|
2005-04-17 02:20:36 +04:00
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (get_user(val, (int __user *)optval))
|
|
|
|
return -EFAULT;
|
|
|
|
|
2007-03-09 07:41:55 +03:00
|
|
|
switch (optname) {
|
2005-04-17 02:20:36 +04:00
|
|
|
case UDP_CORK:
|
|
|
|
if (val != 0) {
|
|
|
|
up->corkflag = 1;
|
|
|
|
} else {
|
|
|
|
up->corkflag = 0;
|
|
|
|
lock_sock(sk);
|
2006-11-27 20:29:59 +03:00
|
|
|
(*push_pending_frames)(sk);
|
2005-04-17 02:20:36 +04:00
|
|
|
release_sock(sk);
|
|
|
|
}
|
|
|
|
break;
|
2007-02-09 17:24:47 +03:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
case UDP_ENCAP:
|
|
|
|
switch (val) {
|
|
|
|
case 0:
|
|
|
|
case UDP_ENCAP_ESPINUDP:
|
|
|
|
case UDP_ENCAP_ESPINUDP_NON_IKE:
|
2007-07-06 04:08:05 +04:00
|
|
|
up->encap_rcv = xfrm4_udp_encap_rcv;
|
|
|
|
/* FALLTHROUGH */
|
2007-06-28 02:37:46 +04:00
|
|
|
case UDP_ENCAP_L2TPINUDP:
|
2005-04-17 02:20:36 +04:00
|
|
|
up->encap_type = val;
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
err = -ENOPROTOOPT;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2006-11-27 22:10:57 +03:00
|
|
|
/*
|
|
|
|
* UDP-Lite's partial checksum coverage (RFC 3828).
|
|
|
|
*/
|
|
|
|
/* The sender sets actual checksum coverage length via this option.
|
|
|
|
* The case coverage > packet length is handled by send module. */
|
|
|
|
case UDPLITE_SEND_CSCOV:
|
2007-12-03 14:34:16 +03:00
|
|
|
if (!is_udplite) /* Disable the option on UDP sockets */
|
2006-11-27 22:10:57 +03:00
|
|
|
return -ENOPROTOOPT;
|
|
|
|
if (val != 0 && val < 8) /* Illegal coverage: use default (8) */
|
|
|
|
val = 8;
|
2008-07-22 00:35:08 +04:00
|
|
|
else if (val > USHORT_MAX)
|
|
|
|
val = USHORT_MAX;
|
2006-11-27 22:10:57 +03:00
|
|
|
up->pcslen = val;
|
|
|
|
up->pcflag |= UDPLITE_SEND_CC;
|
|
|
|
break;
|
|
|
|
|
2007-02-09 17:24:47 +03:00
|
|
|
/* The receiver specifies a minimum checksum coverage value. To make
|
|
|
|
* sense, this should be set to at least 8 (as done below). If zero is
|
2006-11-27 22:10:57 +03:00
|
|
|
* used, this again means full checksum coverage. */
|
|
|
|
case UDPLITE_RECV_CSCOV:
|
2007-12-03 14:34:16 +03:00
|
|
|
if (!is_udplite) /* Disable the option on UDP sockets */
|
2006-11-27 22:10:57 +03:00
|
|
|
return -ENOPROTOOPT;
|
|
|
|
if (val != 0 && val < 8) /* Avoid silly minimal values. */
|
|
|
|
val = 8;
|
2008-07-22 00:35:08 +04:00
|
|
|
else if (val > USHORT_MAX)
|
|
|
|
val = USHORT_MAX;
|
2006-11-27 22:10:57 +03:00
|
|
|
up->pcrlen = val;
|
|
|
|
up->pcflag |= UDPLITE_RECV_CC;
|
|
|
|
break;
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
default:
|
|
|
|
err = -ENOPROTOOPT;
|
|
|
|
break;
|
2007-03-09 07:41:55 +03:00
|
|
|
}
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
2009-07-17 04:26:32 +04:00
|
|
|
EXPORT_SYMBOL(udp_lib_setsockopt);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-03-07 03:22:02 +03:00
|
|
|
int udp_setsockopt(struct sock *sk, int level, int optname,
|
2009-10-01 03:12:20 +04:00
|
|
|
char __user *optval, unsigned int optlen)
|
2008-03-07 03:22:02 +03:00
|
|
|
{
|
|
|
|
if (level == SOL_UDP || level == SOL_UDPLITE)
|
|
|
|
return udp_lib_setsockopt(sk, level, optname, optval, optlen,
|
|
|
|
udp_push_pending_frames);
|
|
|
|
return ip_setsockopt(sk, level, optname, optval, optlen);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_COMPAT
/*
 * Compat variant of udp_setsockopt(): identical for SOL_UDP / SOL_UDPLITE,
 * but other levels are passed on to compat_ip_setsockopt().
 */
int compat_udp_setsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, unsigned int optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return compat_ip_setsockopt(sk, level, optname, optval, optlen);

	return udp_lib_setsockopt(sk, level, optname, optval, optlen,
				  udp_push_pending_frames);
}
#endif
|
|
|
|
|
2006-11-27 20:29:59 +03:00
|
|
|
/*
 * Read a UDP / UDP-Lite level socket option.
 *
 * @optlen is an in/out user pointer: on entry it holds the size of the
 * buffer at @optval, on success it is rewritten with the number of bytes
 * copied out (at most sizeof(int)).
 *
 * Returns 0 on success, -EFAULT if a user access faults, -EINVAL if the
 * supplied length is negative, and -ENOPROTOOPT for an unknown @optname.
 */
int udp_lib_getsockopt(struct sock *sk, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	struct udp_sock *up = udp_sk(sk);
	int val, len;

	if (get_user(len, optlen))
		return -EFAULT;

	/*
	 * Reject a negative length while it is still a signed int. This
	 * check must precede the clamp below: min_t() compares as unsigned
	 * int, so a negative value would be seen as huge and clamped to
	 * sizeof(int), and the sign test could never fire afterwards.
	 */
	if (len < 0)
		return -EINVAL;

	len = min_t(unsigned int, len, sizeof(int));

	switch (optname) {
	case UDP_CORK:
		val = up->corkflag;
		break;

	case UDP_ENCAP:
		val = up->encap_type;
		break;

	/* The following two cannot be changed on UDP sockets, the return is
	 * always 0 (which corresponds to the full checksum coverage of UDP). */
	case UDPLITE_SEND_CSCOV:
		val = up->pcslen;
		break;

	case UDPLITE_RECV_CSCOV:
		val = up->pcrlen;
		break;

	default:
		return -ENOPROTOOPT;
	}

	/* Report the copied length first, then the (possibly truncated) value. */
	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL(udp_lib_getsockopt);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-03-07 03:22:02 +03:00
|
|
|
/*
 * getsockopt entry for UDP sockets: SOL_UDP and SOL_UDPLITE options are
 * answered by udp_lib_getsockopt(); every other level is passed on to
 * ip_getsockopt().
 */
int udp_getsockopt(struct sock *sk, int level, int optname,
		   char __user *optval, int __user *optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return ip_getsockopt(sk, level, optname, optval, optlen);

	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_COMPAT
/*
 * Compat variant of udp_getsockopt(): identical for SOL_UDP / SOL_UDPLITE,
 * but other levels are passed on to compat_ip_getsockopt().
 */
int compat_udp_getsockopt(struct sock *sk, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	if (level != SOL_UDP && level != SOL_UDPLITE)
		return compat_ip_getsockopt(sk, level, optname, optval, optlen);

	return udp_lib_getsockopt(sk, level, optname, optval, optlen);
}
#endif
|
2005-04-17 02:20:36 +04:00
|
|
|
/**
|
|
|
|
* udp_poll - wait for a UDP event.
|
|
|
|
* @file - file struct
|
|
|
|
* @sock - socket
|
|
|
|
* @wait - poll table
|
|
|
|
*
|
2007-02-09 17:24:47 +03:00
|
|
|
* This is same as datagram poll, except for the special case of
|
2005-04-17 02:20:36 +04:00
|
|
|
* blocking sockets. If application is using a blocking fd
|
|
|
|
* and a packet with checksum error is in the queue;
|
|
|
|
* then it could get return from select indicating data available
|
|
|
|
* but then block when reading it. Add special case code
|
|
|
|
* to work around these arguably broken applications.
|
|
|
|
*/
|
|
|
|
unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
|
|
|
|
{
|
|
|
|
unsigned int mask = datagram_poll(file, sock, wait);
|
|
|
|
struct sock *sk = sock->sk;
|
2006-11-27 22:10:57 +03:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/* Check for false positives due to checksum errors */
|
2009-10-09 08:43:40 +04:00
|
|
|
if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
|
|
|
|
!(sk->sk_shutdown & RCV_SHUTDOWN) && !first_packet_length(sk))
|
|
|
|
mask &= ~(POLLIN | POLLRDNORM);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
return mask;
|
2007-02-09 17:24:47 +03:00
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
2009-07-17 04:26:32 +04:00
|
|
|
EXPORT_SYMBOL(udp_poll);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-03-07 03:22:02 +03:00
|
|
|
/*
 * Protocol operations table for IPv4 UDP sockets. Each slot points at the
 * UDP implementation of that socket operation; the memory accounting
 * fields reference the UDP-wide sysctl variables.
 */
struct proto udp_prot = {
	.name		   = "UDP",
	.owner		   = THIS_MODULE,
	.close		   = udp_lib_close,
	.connect	   = ip4_datagram_connect,
	.disconnect	   = udp_disconnect,
	.ioctl		   = udp_ioctl,
	.destroy	   = udp_destroy_sock,
	.setsockopt	   = udp_setsockopt,
	.getsockopt	   = udp_getsockopt,
	.sendmsg	   = udp_sendmsg,
	.recvmsg	   = udp_recvmsg,
	.sendpage	   = udp_sendpage,
	.backlog_rcv	   = __udp_queue_rcv_skb,
	.hash		   = udp_lib_hash,
	.unhash		   = udp_lib_unhash,
	.get_port	   = udp_v4_get_port,
	.memory_allocated  = &udp_memory_allocated,
	.sysctl_mem	   = sysctl_udp_mem,
	.sysctl_wmem	   = &sysctl_udp_wmem_min,
	.sysctl_rmem	   = &sysctl_udp_rmem_min,
	.obj_size	   = sizeof(struct udp_sock),
	/*
	 * Unicast lookup is lockless (rcu_read_lock()/rcu_read_unlock()).
	 * With SLAB_DESTROY_BY_RCU a socket can be freed, reused and
	 * inserted into a different chain (or even the same one) while a
	 * reader is walking, so a reader must check that every socket it
	 * finds really belongs to the chain being traversed and restart the
	 * lookup from the beginning on a mismatch. Multicast/broadcast
	 * delivery and /proc/net/udp still take the chain spinlock.
	 */
	.slab_flags	   = SLAB_DESTROY_BY_RCU,
	.h.udp_table	   = &udp_table,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_udp_setsockopt,
	.compat_getsockopt = compat_udp_getsockopt,
#endif
};
EXPORT_SYMBOL(udp_prot);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
|
2008-10-29 11:41:45 +03:00
|
|
|
static struct sock *udp_get_first(struct seq_file *seq, int start)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
struct sock *sk;
|
|
|
|
struct udp_iter_state *state = seq->private;
|
2008-03-29 04:23:33 +03:00
|
|
|
struct net *net = seq_file_net(seq);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2009-10-07 04:37:59 +04:00
|
|
|
for (state->bucket = start; state->bucket <= state->udp_table->mask;
|
|
|
|
++state->bucket) {
|
2008-11-17 06:39:21 +03:00
|
|
|
struct hlist_nulls_node *node;
|
2008-10-29 11:41:45 +03:00
|
|
|
struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
|
2009-10-07 04:37:59 +04:00
|
|
|
|
|
|
|
if (hlist_nulls_empty(&hslot->head))
|
|
|
|
continue;
|
|
|
|
|
2008-10-29 11:41:45 +03:00
|
|
|
spin_lock_bh(&hslot->lock);
|
2008-11-17 06:39:21 +03:00
|
|
|
sk_nulls_for_each(sk, node, &hslot->head) {
|
2008-03-25 21:57:35 +03:00
|
|
|
if (!net_eq(sock_net(sk), net))
|
2008-03-21 14:11:58 +03:00
|
|
|
continue;
|
2005-04-17 02:20:36 +04:00
|
|
|
if (sk->sk_family == state->family)
|
|
|
|
goto found;
|
|
|
|
}
|
2008-10-29 11:41:45 +03:00
|
|
|
spin_unlock_bh(&hslot->lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
sk = NULL;
|
|
|
|
found:
|
|
|
|
return sk;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
|
|
|
|
{
|
|
|
|
struct udp_iter_state *state = seq->private;
|
2008-03-29 04:23:33 +03:00
|
|
|
struct net *net = seq_file_net(seq);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
|
|
|
do {
|
2008-11-17 06:39:21 +03:00
|
|
|
sk = sk_nulls_next(sk);
|
2008-03-25 21:57:35 +03:00
|
|
|
} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-10-29 11:41:45 +03:00
|
|
|
if (!sk) {
|
2009-10-07 04:37:59 +04:00
|
|
|
if (state->bucket <= state->udp_table->mask)
|
2009-03-24 01:22:33 +03:00
|
|
|
spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
|
2008-10-29 11:41:45 +03:00
|
|
|
return udp_get_first(seq, state->bucket + 1);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
return sk;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Return the socket at zero-based position @pos in iteration order, or NULL
 * when fewer than @pos + 1 matching sockets exist.
 */
static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
{
	struct sock *cur = udp_get_first(seq, 0);

	/* Step forward once per remaining position; stop early at the end. */
	while (cur && pos) {
		cur = udp_get_next(seq, cur);
		if (cur)
			--pos;
	}

	return pos ? NULL : cur;
}
|
|
|
|
|
|
|
|
/*
 * seq_file ->start: position 0 yields the header token; any other position
 * resolves to the socket at index *pos - 1.
 */
static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct udp_iter_state *state = seq->private;

	/*
	 * Start with an out-of-table bucket index so that udp_seq_stop()
	 * does not unlock anything when no bucket lock was taken.
	 */
	state->bucket = MAX_UDP_PORTS;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	return udp_get_idx(seq, *pos - 1);
}
|
|
|
|
|
|
|
|
/*
 * seq_file ->next: after the header token return the first socket,
 * otherwise the socket following @v. *pos is always incremented.
 */
static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct sock *next_sk = (v == SEQ_START_TOKEN) ?
					udp_get_idx(seq, 0) :
					udp_get_next(seq, v);

	++*pos;
	return next_sk;
}
|
|
|
|
|
|
|
|
static void udp_seq_stop(struct seq_file *seq, void *v)
|
|
|
|
{
|
2008-10-29 11:41:45 +03:00
|
|
|
struct udp_iter_state *state = seq->private;
|
|
|
|
|
2009-10-07 04:37:59 +04:00
|
|
|
if (state->bucket <= state->udp_table->mask)
|
2008-10-29 11:41:45 +03:00
|
|
|
spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
|
2005-04-17 02:20:36 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static int udp_seq_open(struct inode *inode, struct file *file)
|
|
|
|
{
|
|
|
|
struct udp_seq_afinfo *afinfo = PDE(inode)->data;
|
2008-03-29 04:25:06 +03:00
|
|
|
struct udp_iter_state *s;
|
|
|
|
int err;
|
2008-03-21 14:11:58 +03:00
|
|
|
|
2008-03-29 04:25:06 +03:00
|
|
|
err = seq_open_net(inode, file, &afinfo->seq_ops,
|
|
|
|
sizeof(struct udp_iter_state));
|
|
|
|
if (err < 0)
|
|
|
|
return err;
|
2008-03-21 14:11:58 +03:00
|
|
|
|
2008-03-29 04:25:06 +03:00
|
|
|
s = ((struct seq_file *)file->private_data)->private;
|
2005-04-17 02:20:36 +04:00
|
|
|
s->family = afinfo->family;
|
2008-10-29 11:41:45 +03:00
|
|
|
s->udp_table = afinfo->udp_table;
|
2008-03-29 04:25:06 +03:00
|
|
|
return err;
|
2008-03-21 14:11:58 +03:00
|
|
|
}
|
|
|
|
|
2005-04-17 02:20:36 +04:00
|
|
|
/* ------------------------------------------------------------------------ */
|
2008-03-21 14:14:17 +03:00
|
|
|
int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
|
2005-04-17 02:20:36 +04:00
|
|
|
{
|
|
|
|
struct proc_dir_entry *p;
|
|
|
|
int rc = 0;
|
|
|
|
|
2008-03-29 04:25:32 +03:00
|
|
|
afinfo->seq_fops.open = udp_seq_open;
|
|
|
|
afinfo->seq_fops.read = seq_read;
|
|
|
|
afinfo->seq_fops.llseek = seq_lseek;
|
|
|
|
afinfo->seq_fops.release = seq_release_net;
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-03-29 04:24:26 +03:00
|
|
|
afinfo->seq_ops.start = udp_seq_start;
|
|
|
|
afinfo->seq_ops.next = udp_seq_next;
|
|
|
|
afinfo->seq_ops.stop = udp_seq_stop;
|
|
|
|
|
2008-05-02 15:10:08 +04:00
|
|
|
p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
|
|
|
|
&afinfo->seq_fops, afinfo);
|
|
|
|
if (!p)
|
2005-04-17 02:20:36 +04:00
|
|
|
rc = -ENOMEM;
|
|
|
|
return rc;
|
|
|
|
}
|
2009-07-17 04:26:32 +04:00
|
|
|
EXPORT_SYMBOL(udp_proc_register);
|
2005-04-17 02:20:36 +04:00
|
|
|
|
2008-03-21 14:14:17 +03:00
|
|
|
/*
 * Remove the proc entry named @afinfo->name from @net; counterpart of
 * udp_proc_register().
 */
void udp_proc_unregister(struct net *net, struct udp_seq_afinfo *afinfo)
{
	proc_net_remove(net, afinfo->name);
}
EXPORT_SYMBOL(udp_proc_unregister);
|
2008-03-07 03:22:02 +03:00
|
|
|
|
|
|
|
/* ------------------------------------------------------------------------ */
|
2008-04-24 12:02:16 +04:00
|
|
|
static void udp4_format_sock(struct sock *sp, struct seq_file *f,
|
|
|
|
int bucket, int *len)
|
2008-03-07 03:22:02 +03:00
|
|
|
{
|
|
|
|
struct inet_sock *inet = inet_sk(sp);
|
2009-10-15 10:30:45 +04:00
|
|
|
__be32 dest = inet->inet_daddr;
|
|
|
|
__be32 src = inet->inet_rcv_saddr;
|
|
|
|
__u16 destp = ntohs(inet->inet_dport);
|
|
|
|
__u16 srcp = ntohs(inet->inet_sport);
|
2008-03-07 03:22:02 +03:00
|
|
|
|
2009-10-07 04:37:59 +04:00
|
|
|
seq_printf(f, "%5d: %08X:%04X %08X:%04X"
|
2008-06-18 08:04:56 +04:00
|
|
|
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
|
2008-03-07 03:22:02 +03:00
|
|
|
bucket, src, srcp, dest, destp, sp->sk_state,
|
2009-06-18 06:05:41 +04:00
|
|
|
sk_wmem_alloc_get(sp),
|
|
|
|
sk_rmem_alloc_get(sp),
|
2008-03-07 03:22:02 +03:00
|
|
|
0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
|
2008-06-18 08:04:56 +04:00
|
|
|
atomic_read(&sp->sk_refcnt), sp,
|
|
|
|
atomic_read(&sp->sk_drops), len);
|
2008-03-07 03:22:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
int udp4_seq_show(struct seq_file *seq, void *v)
|
|
|
|
{
|
|
|
|
if (v == SEQ_START_TOKEN)
|
|
|
|
seq_printf(seq, "%-127s\n",
|
|
|
|
" sl local_address rem_address st tx_queue "
|
|
|
|
"rx_queue tr tm->when retrnsmt uid timeout "
|
2008-06-18 08:04:56 +04:00
|
|
|
"inode ref pointer drops");
|
2008-03-07 03:22:02 +03:00
|
|
|
else {
|
|
|
|
struct udp_iter_state *state = seq->private;
|
2008-04-24 12:02:16 +04:00
|
|
|
int len;
|
2008-03-07 03:22:02 +03:00
|
|
|
|
2008-04-24 12:02:16 +04:00
|
|
|
udp4_format_sock(v, seq, state->bucket, &len);
|
2009-07-17 04:26:32 +04:00
|
|
|
seq_printf(seq, "%*s\n", 127 - len, "");
|
2008-03-07 03:22:02 +03:00
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* ------------------------------------------------------------------------ */
|
|
|
|
/*
 * Description of the IPv4 UDP proc file: entry name "udp", AF_INET sockets
 * from the global udp_table, formatted by udp4_seq_show(). The remaining
 * seq_fops / seq_ops callbacks are filled in by udp_proc_register().
 */
static struct udp_seq_afinfo udp4_seq_afinfo = {
	.name		= "udp",
	.family		= AF_INET,
	.udp_table	= &udp_table,
	.seq_fops	= {
		.owner	=	THIS_MODULE,
	},
	.seq_ops	= {
		.show		= udp4_seq_show,
	},
};
|
|
|
|
|
2010-01-17 06:35:32 +03:00
|
|
|
/* Per-namespace init: register the IPv4 UDP proc entry in @net. */
static int __net_init udp4_proc_init_net(struct net *net)
{
	return udp_proc_register(net, &udp4_seq_afinfo);
}
|
|
|
|
|
2010-01-17 06:35:32 +03:00
|
|
|
/* Per-namespace exit: remove the IPv4 UDP proc entry from @net. */
static void __net_exit udp4_proc_exit_net(struct net *net)
{
	udp_proc_unregister(net, &udp4_seq_afinfo);
}
|
|
|
|
|
|
|
|
/* Per-namespace hooks that create and remove the IPv4 UDP proc entry. */
static struct pernet_operations udp4_net_ops = {
	.init = udp4_proc_init_net,
	.exit = udp4_proc_exit_net,
};
|
|
|
|
|
2008-03-07 03:22:02 +03:00
|
|
|
/*
 * Register udp4_net_ops as a pernet subsystem; returns the result of
 * register_pernet_subsys().
 */
int __init udp4_proc_init(void)
{
	return register_pernet_subsys(&udp4_net_ops);
}
|
|
|
|
|
|
|
|
/* Undo udp4_proc_init(): unregister the udp4_net_ops pernet subsystem. */
void udp4_proc_exit(void)
{
	unregister_pernet_subsys(&udp4_net_ops);
}
|
2005-04-17 02:20:36 +04:00
|
|
|
#endif /* CONFIG_PROC_FS */
|
|
|
|
|
2009-10-07 04:37:59 +04:00
|
|
|
static __initdata unsigned long uhash_entries;
|
|
|
|
static int __init set_uhash_entries(char *str)
|
2008-10-29 11:41:45 +03:00
|
|
|
{
|
2009-10-07 04:37:59 +04:00
|
|
|
if (!str)
|
|
|
|
return 0;
|
|
|
|
uhash_entries = simple_strtoul(str, &str, 0);
|
|
|
|
if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
|
|
|
|
uhash_entries = UDP_HTABLE_SIZE_MIN;
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
__setup("uhash_entries=", set_uhash_entries);
|
2008-10-29 11:41:45 +03:00
|
|
|
|
2009-10-07 04:37:59 +04:00
|
|
|
/*
 * Allocate and initialise the hash slots of a UDP table.
 *
 * @table: table to set up; ->hash, ->hash2, ->log and ->mask are written.
 * @name:  name passed to alloc_large_system_hash() and used as the panic
 *         message if the fallback allocation fails.
 *
 * One allocation holds both slot arrays: ->hash occupies the first
 * mask + 1 slots and ->hash2 the mask + 1 slots that follow.
 */
void __init udp_table_init(struct udp_table *table, const char *name)
{
	unsigned int i;

	if (!CONFIG_BASE_SMALL)
		table->hash = alloc_large_system_hash(name,
			2 * sizeof(struct udp_hslot),
			uhash_entries,
			21, /* one slot per 2 MB */
			0,
			&table->log,
			&table->mask,
			64 * 1024);
	/*
	 * Make sure hash table has the minimum size
	 */
	if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
		table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
				      2 * sizeof(struct udp_hslot), GFP_KERNEL);
		if (!table->hash)
			/* Never pass a caller-supplied string as the format. */
			panic("%s", name);
		table->log = ilog2(UDP_HTABLE_SIZE_MIN);
		table->mask = UDP_HTABLE_SIZE_MIN - 1;
	}
	/* The secondary slot array lives directly behind the primary one. */
	table->hash2 = table->hash + (table->mask + 1);
	for (i = 0; i <= table->mask; i++) {
		INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
		table->hash[i].count = 0;
		spin_lock_init(&table->hash[i].lock);
	}
	for (i = 0; i <= table->mask; i++) {
		INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
		table->hash2[i].count = 0;
		spin_lock_init(&table->hash2[i].lock);
	}
}
|
|
|
|
|
2007-12-31 11:29:24 +03:00
|
|
|
/*
 * Boot-time UDP setup: initialise the global udp_table, derive the three
 * sysctl_udp_mem thresholds from the amount of non-highmem RAM, and set the
 * minimum per-socket receive and send buffer sizes to SK_MEM_QUANTUM.
 */
void __init udp_init(void)
{
	unsigned long lowmem_pages, pressure;

	udp_table_init(&udp_table, "UDP");

	/* Set the pressure threshold up by the same strategy of TCP. It is a
	 * fraction of global memory that is up to 1/2 at 256 MB, decreasing
	 * toward zero with the amount of memory, with a floor of 128 pages.
	 */
	lowmem_pages = totalram_pages - totalhigh_pages;

	pressure = min(lowmem_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT);
	pressure = (pressure * (lowmem_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11);
	pressure = max(pressure, 128UL);

	sysctl_udp_mem[0] = pressure / 4 * 3;
	sysctl_udp_mem[1] = pressure;
	sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;

	sysctl_udp_rmem_min = SK_MEM_QUANTUM;
	sysctl_udp_wmem_min = SK_MEM_QUANTUM;
}
|
|
|
|
|
2009-07-09 12:09:47 +04:00
|
|
|
int udp4_ufo_send_check(struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
const struct iphdr *iph;
|
|
|
|
struct udphdr *uh;
|
|
|
|
|
|
|
|
if (!pskb_may_pull(skb, sizeof(*uh)))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
iph = ip_hdr(skb);
|
|
|
|
uh = udp_hdr(skb);
|
|
|
|
|
|
|
|
uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len,
|
|
|
|
IPPROTO_UDP, 0);
|
|
|
|
skb->csum_start = skb_transport_header(skb) - skb->head;
|
|
|
|
skb->csum_offset = offsetof(struct udphdr, check);
|
|
|
|
skb->ip_summed = CHECKSUM_PARTIAL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * UDP fragmentation offload handler for @skb given the @features mask.
 *
 * Returns ERR_PTR(-EINVAL) when the skb is not longer than its gso_size or
 * carries unexpected gso_type bits, NULL when skb_gso_ok() accepts the skb
 * for @features (only gso_segs is recomputed), and otherwise the result of
 * skb_segment() after the UDP checksum has been completed in software.
 */
struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features)
{
	struct sk_buff *segs = ERR_PTR(-EINVAL);
	unsigned int mss;
	int offset;
	__wsum csum;

	mss = skb_shinfo(skb)->gso_size;
	/* Nothing to split unless the skb is longer than one segment. */
	if (unlikely(skb->len <= mss))
		goto out;

	if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) {
		/* Packet is from an untrusted source, reset gso_segs. */
		int type = skb_shinfo(skb)->gso_type;

		/* Only SKB_GSO_UDP, optionally with SKB_GSO_DODGY, is valid. */
		if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY) ||
			     !(type & (SKB_GSO_UDP))))
			goto out;

		skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss);

		/* NOTE(review): NULL presumably tells the caller that no
		 * software segmentation is needed - confirm at the caller. */
		segs = NULL;
		goto out;
	}

	/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
	 * do checksum of UDP packets sent as multiple IP fragments.
	 */
	/* csum_start is relative to skb->head; make it relative to skb->data. */
	offset = skb->csum_start - skb_headroom(skb);
	csum = skb_checksum(skb, offset, skb->len - offset, 0);
	/* Step to the checksum field and store the folded result there. */
	offset += skb->csum_offset;
	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
	skb->ip_summed = CHECKSUM_NONE;

	/* Fragment the skb. IP headers of the fragments are updated in
	 * inet_gso_segment()
	 */
	segs = skb_segment(skb, features);
out:
	return segs;
}
|
|
|
|
|