diff --git a/MAINTAINERS b/MAINTAINERS index 9d263b899901..9fa5b2f69212 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11727,6 +11727,7 @@ W: https://github.com/multipath-tcp/mptcp_net-next/wiki B: https://github.com/multipath-tcp/mptcp_net-next/issues S: Maintained F: include/net/mptcp.h +F: include/uapi/linux/mptcp.h F: net/mptcp/ F: tools/testing/selftests/net/mptcp/ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 3dc964010fef..421c99c12291 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -86,9 +86,19 @@ struct mptcp_options_received { u64 data_seq; u32 subflow_seq; u16 data_len; - u8 mp_capable : 1, + u16 mp_capable : 1, mp_join : 1, - dss : 1; + dss : 1, + add_addr : 1, + rm_addr : 1, + family : 4, + echo : 1, + backup : 1; + u32 token; + u32 nonce; + u64 thmac; + u8 hmac[20]; + u8 join_id; u8 use_map:1, dsn64:1, data_fin:1, @@ -96,6 +106,16 @@ struct mptcp_options_received { ack64:1, mpc_map:1, __unused:2; + u8 addr_id; + u8 rm_id; + union { + struct in_addr addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct in6_addr addr6; +#endif + }; + u64 ahmac; + u16 port; }; #endif @@ -131,6 +151,8 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) #if IS_ENABLED(CONFIG_MPTCP) rx_opt->mptcp.mp_capable = 0; rx_opt->mptcp.mp_join = 0; + rx_opt->mptcp.add_addr = 0; + rx_opt->mptcp.rm_addr = 0; rx_opt->mptcp.dss = 0; #endif } diff --git a/include/net/mptcp.h b/include/net/mptcp.h index c971d25431ea..0e7c5471010b 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -12,6 +12,8 @@ #include #include +struct seq_file; + /* MPTCP sk_buff extension data */ struct mptcp_ext { u64 data_ack; @@ -33,6 +35,21 @@ struct mptcp_out_options { u16 suboptions; u64 sndr_key; u64 rcvr_key; + union { + struct in_addr addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct in6_addr addr6; +#endif + }; + u8 addr_id; + u64 ahmac; + u8 rm_id; + u8 join_id; + u8 backup; + u32 nonce; + u64 thmac; + u32 token; + u8 hmac[20]; struct mptcp_ext 
ext_copy; #endif }; @@ -106,6 +123,9 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, skb_ext_find(from, SKB_EXT_MPTCP)); } +bool mptcp_sk_is_subflow(const struct sock *sk); + +void mptcp_seq_show(struct seq_file *seq); #else static inline void mptcp_init(void) @@ -172,6 +192,12 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, return true; } +static inline bool mptcp_sk_is_subflow(const struct sock *sk) +{ + return false; +} + +static inline void mptcp_seq_show(struct seq_file *seq) { } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) diff --git a/include/net/netns/mib.h b/include/net/netns/mib.h index b5fdb108d602..59b2c3a3db42 100644 --- a/include/net/netns/mib.h +++ b/include/net/netns/mib.h @@ -27,6 +27,9 @@ struct netns_mib { #if IS_ENABLED(CONFIG_TLS) DEFINE_SNMP_STAT(struct linux_tls_mib, tls_statistics); #endif +#ifdef CONFIG_MPTCP + DEFINE_SNMP_STAT(struct mptcp_mib, mptcp_statistics); +#endif }; #endif diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 75dffd78363a..57cc429a9177 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -166,6 +166,7 @@ enum { INET_ULP_INFO_UNSPEC, INET_ULP_INFO_NAME, INET_ULP_INFO_TLS, + INET_ULP_INFO_MPTCP, __INET_ULP_INFO_MAX, }; #define INET_ULP_INFO_MAX (__INET_ULP_INFO_MAX - 1) diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h new file mode 100644 index 000000000000..5f2c77082d9e --- /dev/null +++ b/include/uapi/linux/mptcp.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +#ifndef _UAPI_MPTCP_H +#define _UAPI_MPTCP_H + +#include +#include + +#define MPTCP_SUBFLOW_FLAG_MCAP_REM _BITUL(0) +#define MPTCP_SUBFLOW_FLAG_MCAP_LOC _BITUL(1) +#define MPTCP_SUBFLOW_FLAG_JOIN_REM _BITUL(2) +#define MPTCP_SUBFLOW_FLAG_JOIN_LOC _BITUL(3) +#define MPTCP_SUBFLOW_FLAG_BKUP_REM _BITUL(4) +#define MPTCP_SUBFLOW_FLAG_BKUP_LOC _BITUL(5) +#define 
MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED _BITUL(6) +#define MPTCP_SUBFLOW_FLAG_CONNECTED _BITUL(7) +#define MPTCP_SUBFLOW_FLAG_MAPVALID _BITUL(8) + +enum { + MPTCP_SUBFLOW_ATTR_UNSPEC, + MPTCP_SUBFLOW_ATTR_TOKEN_REM, + MPTCP_SUBFLOW_ATTR_TOKEN_LOC, + MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, + MPTCP_SUBFLOW_ATTR_MAP_SEQ, + MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, + MPTCP_SUBFLOW_ATTR_SSN_OFFSET, + MPTCP_SUBFLOW_ATTR_MAP_DATALEN, + MPTCP_SUBFLOW_ATTR_FLAGS, + MPTCP_SUBFLOW_ATTR_ID_REM, + MPTCP_SUBFLOW_ATTR_ID_LOC, + MPTCP_SUBFLOW_ATTR_PAD, + __MPTCP_SUBFLOW_ATTR_MAX +}; + +#define MPTCP_SUBFLOW_ATTR_MAX (__MPTCP_SUBFLOW_ATTR_MAX - 1) + +/* netlink interface */ +#define MPTCP_PM_NAME "mptcp_pm" +#define MPTCP_PM_CMD_GRP_NAME "mptcp_pm_cmds" +#define MPTCP_PM_VER 0x1 + +/* + * ATTR types defined for MPTCP + */ +enum { + MPTCP_PM_ATTR_UNSPEC, + + MPTCP_PM_ATTR_ADDR, /* nested address */ + MPTCP_PM_ATTR_RCV_ADD_ADDRS, /* u32 */ + MPTCP_PM_ATTR_SUBFLOWS, /* u32 */ + + __MPTCP_PM_ATTR_MAX +}; + +#define MPTCP_PM_ATTR_MAX (__MPTCP_PM_ATTR_MAX - 1) + +enum { + MPTCP_PM_ADDR_ATTR_UNSPEC, + + MPTCP_PM_ADDR_ATTR_FAMILY, /* u16 */ + MPTCP_PM_ADDR_ATTR_ID, /* u8 */ + MPTCP_PM_ADDR_ATTR_ADDR4, /* struct in_addr */ + MPTCP_PM_ADDR_ATTR_ADDR6, /* struct in6_addr */ + MPTCP_PM_ADDR_ATTR_PORT, /* u16 */ + MPTCP_PM_ADDR_ATTR_FLAGS, /* u32 */ + MPTCP_PM_ADDR_ATTR_IF_IDX, /* s32 */ + + __MPTCP_PM_ADDR_ATTR_MAX +}; + +#define MPTCP_PM_ADDR_ATTR_MAX (__MPTCP_PM_ADDR_ATTR_MAX - 1) + +#define MPTCP_PM_ADDR_FLAG_SIGNAL (1 << 0) +#define MPTCP_PM_ADDR_FLAG_SUBFLOW (1 << 1) +#define MPTCP_PM_ADDR_FLAG_BACKUP (1 << 2) + +enum { + MPTCP_PM_CMD_UNSPEC, + + MPTCP_PM_CMD_ADD_ADDR, + MPTCP_PM_CMD_DEL_ADDR, + MPTCP_PM_CMD_GET_ADDR, + MPTCP_PM_CMD_FLUSH_ADDRS, + MPTCP_PM_CMD_SET_LIMITS, + MPTCP_PM_CMD_GET_LIMITS, + + __MPTCP_PM_CMD_AFTER_LAST +}; + +#endif /* _UAPI_MPTCP_H */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index bd7b4e92e07f..cf58e29cf746 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ 
-1793,6 +1793,10 @@ static __net_exit void ipv4_mib_exit_net(struct net *net) free_percpu(net->mib.net_statistics); free_percpu(net->mib.ip_statistics); free_percpu(net->mib.tcp_statistics); +#ifdef CONFIG_MPTCP + /* allocated on demand, see mptcp_init_sock() */ + free_percpu(net->mib.mptcp_statistics); +#endif } static __net_initdata struct pernet_operations ipv4_mib_ops = { diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2580303249e2..75545a829a2b 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -485,6 +486,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) offsetof(struct ipstats_mib, syncp))); seq_putc(seq, '\n'); + mptcp_seq_show(seq); return 0; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 03af7c3e75ef..7e40322cc5ec 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,6 +774,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; + if (own_req && sk_is_mptcp(child) && mptcp_sk_is_subflow(child)) { + reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); + inet_csk_reqsk_queue_drop_and_put(sk, req); + return child; + } + sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); *req_stolen = !own_req; diff --git a/net/mptcp/Makefile b/net/mptcp/Makefile index 4e98d9edfd0a..baa0640527c7 100644 --- a/net/mptcp/Makefile +++ b/net/mptcp/Makefile @@ -1,4 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_MPTCP) += mptcp.o -mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o +mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \ + mib.o pm_netlink.o diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index 40d1bb18fd60..c151628bd416 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -44,8 +44,7 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) *idsn = be64_to_cpu(*((__be64 
*)&mptcp_hashed_key[6])); } -void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, - void *hmac) +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) { u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; __be32 mptcp_hashed_key[SHA256_DIGEST_WORDS]; @@ -55,6 +54,9 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, u8 key2be[8]; int i; + if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE)) + len = SHA256_DIGEST_SIZE; + put_unaligned_be64(key1, key1be); put_unaligned_be64(key2, key2be); @@ -65,11 +67,10 @@ void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, for (i = 0; i < 8; i++) input[i + 8] ^= key2be[i]; - put_unaligned_be32(nonce1, &input[SHA256_BLOCK_SIZE]); - put_unaligned_be32(nonce2, &input[SHA256_BLOCK_SIZE + 4]); + memcpy(&input[SHA256_BLOCK_SIZE], msg, len); sha256_init(&state); - sha256_update(&state, input, SHA256_BLOCK_SIZE + 8); + sha256_update(&state, input, SHA256_BLOCK_SIZE + len); /* emit sha256(K1 || msg) on the second input block, so we can * reuse 'input' for the last hashing @@ -125,6 +126,7 @@ static int __init test_mptcp_crypto(void) char hmac[20], hmac_hex[41]; u32 nonce1, nonce2; u64 key1, key2; + u8 msg[8]; int i, j; for (i = 0; i < ARRAY_SIZE(tests); ++i) { @@ -134,7 +136,10 @@ static int __init test_mptcp_crypto(void) nonce1 = be32_to_cpu(*((__be32 *)&tests[i].msg[0])); nonce2 = be32_to_cpu(*((__be32 *)&tests[i].msg[4])); - mptcp_crypto_hmac_sha(key1, key2, nonce1, nonce2, hmac); + put_unaligned_be32(nonce1, &msg[0]); + put_unaligned_be32(nonce2, &msg[4]); + + mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); for (j = 0; j < 20; ++j) sprintf(&hmac_hex[j << 1], "%02x", hmac[j] & 0xff); hmac_hex[40] = 0; diff --git a/net/mptcp/diag.c b/net/mptcp/diag.c new file mode 100644 index 000000000000..a536586742f2 --- /dev/null +++ b/net/mptcp/diag.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* MPTCP socket monitoring support + * + * Copyright (c) 2019 Red Hat + * 
+ * Author: Davide Caratti + */ + +#include +#include +#include +#include +#include +#include "protocol.h" + +static int subflow_get_info(const struct sock *sk, struct sk_buff *skb) +{ + struct mptcp_subflow_context *sf; + struct nlattr *start; + u32 flags = 0; + int err; + + start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP); + if (!start) + return -EMSGSIZE; + + rcu_read_lock(); + sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data); + if (!sf) { + err = 0; + goto nla_failure; + } + + if (sf->mp_capable) + flags |= MPTCP_SUBFLOW_FLAG_MCAP_REM; + if (sf->request_mptcp) + flags |= MPTCP_SUBFLOW_FLAG_MCAP_LOC; + if (sf->mp_join) + flags |= MPTCP_SUBFLOW_FLAG_JOIN_REM; + if (sf->request_join) + flags |= MPTCP_SUBFLOW_FLAG_JOIN_LOC; + if (sf->backup) + flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM; + if (sf->request_bkup) + flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC; + if (sf->fully_established) + flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED; + if (sf->conn_finished) + flags |= MPTCP_SUBFLOW_FLAG_CONNECTED; + if (sf->map_valid) + flags |= MPTCP_SUBFLOW_FLAG_MAPVALID; + + if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, + sf->rel_write_seq) || + nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq, + MPTCP_SUBFLOW_ATTR_PAD) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, + sf->map_subflow_seq) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) || + nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN, + sf->map_data_len) || + nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) || + nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) || + nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, sf->local_id)) { + err = -EMSGSIZE; + goto nla_failure; + } + + rcu_read_unlock(); + nla_nest_end(skb, start); + return 0; + +nla_failure: + rcu_read_unlock(); + nla_nest_cancel(skb, start); + return err; +} + +static size_t 
subflow_get_info_size(const struct sock *sk) +{ + size_t size = 0; + + size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ + nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ + nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ + nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */ + nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */ + nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */ + 0; + return size; +} + +void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops) +{ + ops->get_info = subflow_get_info; + ops->get_info_size = subflow_get_info_size; +} diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c new file mode 100644 index 000000000000..0a6a15f3456d --- /dev/null +++ b/net/mptcp/mib.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include + +#include "mib.h" + +static const struct snmp_mib mptcp_snmp_list[] = { + SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), + SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), + SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), + SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), + SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), + SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), + SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), + SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), + SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), + SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), + SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), + SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), + SNMP_MIB_ITEM("InfiniteMapRx",
MPTCP_MIB_INFINITEMAPRX), + SNMP_MIB_SENTINEL +}; + +/* mptcp_mib_alloc - allocate percpu mib counters + * + * These are allocated when the first mptcp socket is created so + * we do not waste percpu memory if mptcp isn't in use. + */ +bool mptcp_mib_alloc(struct net *net) +{ + struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib); + + if (!mib) + return false; + + if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib)) + free_percpu(mib); + + return true; +} + +void mptcp_seq_show(struct seq_file *seq) +{ + struct net *net = seq->private; + int i; + + seq_puts(seq, "MPTcpExt:"); + for (i = 0; mptcp_snmp_list[i].name; i++) + seq_printf(seq, " %s", mptcp_snmp_list[i].name); + + seq_puts(seq, "\nMPTcpExt:"); + + if (!net->mib.mptcp_statistics) { + for (i = 0; mptcp_snmp_list[i].name; i++) + seq_puts(seq, " 0"); + seq_putc(seq, '\n'); /* close the all-zero row too */ + return; + } + + for (i = 0; mptcp_snmp_list[i].name; i++) + seq_printf(seq, " %lu", + snmp_fold_field(net->mib.mptcp_statistics, + mptcp_snmp_list[i].entry)); + seq_putc(seq, '\n'); +} diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h new file mode 100644 index 000000000000..d7de340fc997 --- /dev/null +++ b/net/mptcp/mib.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +enum linux_mptcp_mib_field { + MPTCP_MIB_NUM = 0, + MPTCP_MIB_MPCAPABLEPASSIVE, /* Received SYN with MP_CAPABLE */ + MPTCP_MIB_MPCAPABLEPASSIVEACK, /* Received third ACK with MP_CAPABLE */ + MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,/* Server-side fallback during 3-way handshake */ + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK, /* Client-side fallback during 3-way handshake */ + MPTCP_MIB_RETRANSSEGS, /* Segments retransmitted at the MPTCP-level */ + MPTCP_MIB_JOINNOTOKEN, /* Received MP_JOIN but the token was not found */ + MPTCP_MIB_JOINSYNRX, /* Received a SYN + MP_JOIN */ + MPTCP_MIB_JOINSYNACKRX, /* Received a SYN/ACK + MP_JOIN */ + MPTCP_MIB_JOINSYNACKMAC, /* HMAC was wrong on SYN/ACK + MP_JOIN */ + MPTCP_MIB_JOINACKRX, /* Received an ACK + MP_JOIN */ + MPTCP_MIB_JOINACKMAC, /*
HMAC was wrong on ACK + MP_JOIN */ + MPTCP_MIB_DSSNOMATCH, /* Received a new mapping that did not match the previous one */ + MPTCP_MIB_INFINITEMAPRX, /* Received an infinite mapping */ + __MPTCP_MIB_MAX +}; + +#define LINUX_MIB_MPTCP_MAX __MPTCP_MIB_MAX +struct mptcp_mib { + unsigned long mibs[LINUX_MIB_MPTCP_MAX]; +}; + +static inline void MPTCP_INC_STATS(struct net *net, + enum linux_mptcp_mib_field field) +{ + if (likely(net->mib.mptcp_statistics)) + SNMP_INC_STATS(net->mib.mptcp_statistics, field); +} + +static inline void __MPTCP_INC_STATS(struct net *net, + enum linux_mptcp_mib_field field) +{ + if (likely(net->mib.mptcp_statistics)) + __SNMP_INC_STATS(net->mib.mptcp_statistics, field); +} + +bool mptcp_mib_alloc(struct net *net); diff --git a/net/mptcp/options.c b/net/mptcp/options.c index aea1a62d9999..bd220ee4aac9 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -96,6 +96,38 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, mp_opt->rcvr_key, mp_opt->data_len); break; + case MPTCPOPT_MP_JOIN: + mp_opt->mp_join = 1; + if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { + mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; + mp_opt->join_id = *ptr++; + mp_opt->token = get_unaligned_be32(ptr); + ptr += 4; + mp_opt->nonce = get_unaligned_be32(ptr); + ptr += 4; + pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u", + mp_opt->backup, mp_opt->join_id, + mp_opt->token, mp_opt->nonce); + } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { + mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; + mp_opt->join_id = *ptr++; + mp_opt->thmac = get_unaligned_be64(ptr); + ptr += 8; + mp_opt->nonce = get_unaligned_be32(ptr); + ptr += 4; + pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u", + mp_opt->backup, mp_opt->join_id, + mp_opt->thmac, mp_opt->nonce); + } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { + ptr += 2; + memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); + pr_debug("MP_JOIN hmac"); + } else { + pr_warn("MP_JOIN bad option size"); + mp_opt->mp_join = 
0; + } + break; + case MPTCPOPT_DSS: pr_debug("DSS"); ptr++; @@ -178,6 +210,71 @@ void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, break; + case MPTCPOPT_ADD_ADDR: + mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; + if (!mp_opt->echo) { + if (opsize == TCPOLEN_MPTCP_ADD_ADDR || + opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_4; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_6; +#endif + else + break; + } else { + if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || + opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_4; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) + mp_opt->family = MPTCP_ADDR_IPVERSION_6; +#endif + else + break; + } + + mp_opt->add_addr = 1; + mp_opt->port = 0; + mp_opt->addr_id = *ptr++; + pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id); + if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { + memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); + ptr += 4; + if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || + opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { + mp_opt->port = get_unaligned_be16(ptr); + ptr += 2; + } + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else { + memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16); + ptr += 16; + if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || + opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { + mp_opt->port = get_unaligned_be16(ptr); + ptr += 2; + } + } +#endif + if (!mp_opt->echo) { + mp_opt->ahmac = get_unaligned_be64(ptr); + ptr += 8; + } + break; + + case MPTCPOPT_RM_ADDR: + if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE) + break; + + mp_opt->rm_addr = 1; + mp_opt->rm_id = *ptr++; + pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); + break; + default: break; } @@ -231,6 +328,16 @@ bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, opts->sndr_key = 
subflow->local_key; *size = TCPOLEN_MPTCP_MPC_SYN; return true; + } else if (subflow->request_join) { + pr_debug("remote_token=%u, nonce=%u", subflow->remote_token, + subflow->local_nonce); + opts->suboptions = OPTION_MPTCP_MPJ_SYN; + opts->join_id = subflow->local_id; + opts->token = subflow->remote_token; + opts->nonce = subflow->local_nonce; + opts->backup = subflow->request_bkup; + *size = TCPOLEN_MPTCP_MPJ_SYN; + return true; } return false; } @@ -240,16 +347,55 @@ void mptcp_rcv_synsent(struct sock *sk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct tcp_sock *tp = tcp_sk(sk); - pr_debug("subflow=%p", subflow); if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { subflow->mp_capable = 1; subflow->can_ack = 1; subflow->remote_key = tp->rx_opt.mptcp.sndr_key; - } else { + pr_debug("subflow=%p, remote_key=%llu", subflow, + subflow->remote_key); + } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) { + subflow->mp_join = 1; + subflow->thmac = tp->rx_opt.mptcp.thmac; + subflow->remote_nonce = tp->rx_opt.mptcp.nonce; + pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, + subflow->thmac, subflow->remote_nonce); + } else if (subflow->request_mptcp) { tcp_sk(sk)->is_mptcp = 0; } } +/* MP_JOIN client subflow must wait for 4th ack before sending any data: + * TCP can't schedule delack timer before the subflow is fully established. 
+ * MPTCP uses the delack timer to do 3rd ack retransmissions + */ +static void schedule_3rdack_retransmission(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; + + /* srtt_us holds usec << 3; wait 2 * RTT, we must look only for drop */ + if (tp->srtt_us) + timeout = usecs_to_jiffies(tp->srtt_us >> (3 - 1)); + else + timeout = TCP_TIMEOUT_INIT; + timeout += jiffies; /* the delack timer takes an absolute expiry */ + WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); +} + +static void clear_3rdack_retransmission(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + sk_stop_timer(sk, &icsk->icsk_delack_timer); + icsk->icsk_ack.timeout = 0; + icsk->icsk_ack.ato = 0; + icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); +} + static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, @@ -259,17 +405,21 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, struct mptcp_ext *mpext; unsigned int data_len; - pr_debug("subflow=%p fully established=%d seq=%x:%x remaining=%d", - subflow, subflow->fully_established, subflow->snd_isn, - skb ? TCP_SKB_CB(skb)->seq : 0, remaining); + /* When skb is not available, we better over-estimate the emitted + * options len. A full DSS option (28 bytes) is longer than + * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so + * tell the caller to defer the estimate to + * mptcp_established_options_dss(), which will reserve enough space. + */ + if (!skb) + return false; - if (subflow->mp_capable && !subflow->fully_established && skb && - subflow->snd_isn == TCP_SKB_CB(skb)->seq) { - /* When skb is not available, we better over-estimate the - * emitted options len.
A full DSS option is longer than - * TCPOLEN_MPTCP_MPC_ACK_DATA, so let's the caller try to fit - * that. - */ + /* MPC/MPJ needed only on 3rd ack packet */ + if (subflow->fully_established || + subflow->snd_isn != TCP_SKB_CB(skb)->seq) + return false; + + if (subflow->mp_capable) { mpext = mptcp_get_ext(skb); data_len = mpext ? mpext->data_len : 0; @@ -297,6 +447,14 @@ static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, data_len); return true; + } else if (subflow->mp_join) { + opts->suboptions = OPTION_MPTCP_MPJ_ACK; + memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); + *size = TCPOLEN_MPTCP_MPJ_ACK; + pr_debug("subflow=%p", subflow); + + schedule_3rdack_retransmission(sk); + return true; } return false; } @@ -386,6 +544,83 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return true; } +static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, + struct in_addr *addr) +{ + u8 hmac[MPTCP_ADDR_HMAC_LEN]; + u8 msg[7]; + + msg[0] = addr_id; + memcpy(&msg[1], &addr->s_addr, 4); + msg[5] = 0; + msg[6] = 0; + + mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); + + return get_unaligned_be64(hmac); +} + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) +static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, + struct in6_addr *addr) +{ + u8 hmac[MPTCP_ADDR_HMAC_LEN]; + u8 msg[19]; + + msg[0] = addr_id; + memcpy(&msg[1], &addr->s6_addr, 16); + msg[17] = 0; + msg[18] = 0; + + mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); + + return get_unaligned_be64(hmac); +} +#endif + +static bool mptcp_established_options_addr(struct sock *sk, + unsigned int *size, + unsigned int remaining, + struct mptcp_out_options *opts) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct mptcp_addr_info saddr; + int len; + + if (!mptcp_pm_should_signal(msk) || + !(mptcp_pm_addr_signal(msk, remaining, &saddr))) + return false; + + len = 
mptcp_add_addr_len(saddr.family); + if (remaining < len) + return false; + + *size = len; + opts->addr_id = saddr.id; + if (saddr.family == AF_INET) { + opts->suboptions |= OPTION_MPTCP_ADD_ADDR; + opts->addr = saddr.addr; + opts->ahmac = add_addr_generate_hmac(msk->local_key, + msk->remote_key, + opts->addr_id, + &opts->addr); + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (saddr.family == AF_INET6) { + opts->suboptions |= OPTION_MPTCP_ADD_ADDR6; + opts->addr6 = saddr.addr6; + opts->ahmac = add_addr6_generate_hmac(msk->local_key, + msk->remote_key, + opts->addr_id, + &opts->addr6); + } +#endif + pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac); + + return true; +} + bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) @@ -393,6 +628,8 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int opt_size = 0; bool ret = false; + opts->suboptions = 0; + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, @@ -407,6 +644,11 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, *size += opt_size; remaining -= opt_size; + if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) { + *size += opt_size; + remaining -= opt_size; + ret = true; + } return ret; } @@ -423,54 +665,194 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, pr_debug("subflow_req=%p, local_key=%llu", subflow_req, subflow_req->local_key); return true; + } else if (subflow_req->mp_join) { + opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; + opts->backup = subflow_req->backup; + opts->join_id = subflow_req->local_id; + opts->thmac = subflow_req->thmac; + opts->nonce = subflow_req->local_nonce; + pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u", + subflow_req, opts->backup, opts->join_id, + opts->thmac, opts->nonce); 
+ *size = TCPOLEN_MPTCP_MPJ_SYNACK; + return true; } return false; } -static bool check_fully_established(struct mptcp_subflow_context *subflow, +static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, + struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_options_received *mp_opt) { /* here we can process OoO, in-window pkts, only in-sequence 4th ack - * are relevant + * will make the subflow fully established */ - if (likely(subflow->fully_established || - TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1)) - return true; + if (likely(subflow->fully_established)) { + /* on passive sockets, check for 3rd ack retransmission + * note that msk is always set by subflow_syn_recv_sock() + * for mp_join subflows + */ + if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && + subflow->mp_join && mp_opt->mp_join && + READ_ONCE(msk->pm.server_side)) + tcp_send_ack(sk); + goto fully_established; + } - if (mp_opt->use_ack) + /* we should process OoO packets before the first subflow is fully + * established, but not expected for MP_JOIN subflows + */ + if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) + return subflow->mp_capable; + + if (mp_opt->use_ack) { + /* subflows are fully established as soon as we get any + * additional ack. 
+ */ subflow->fully_established = 1; + goto fully_established; + } - if (subflow->can_ack) - return true; + WARN_ON_ONCE(subflow->can_ack); /* If the first established packet does not contain MP_CAPABLE + data * then fallback to TCP */ if (!mp_opt->mp_capable) { subflow->mp_capable = 0; - tcp_sk(mptcp_subflow_tcp_sock(subflow))->is_mptcp = 0; + tcp_sk(sk)->is_mptcp = 0; return false; } + + subflow->fully_established = 1; subflow->remote_key = mp_opt->sndr_key; subflow->can_ack = 1; + +fully_established: + if (likely(subflow->pm_notified)) + return true; + + subflow->pm_notified = 1; + if (subflow->mp_join) { + clear_3rdack_retransmission(sk); + mptcp_pm_subflow_established(msk, subflow); + } else { + mptcp_pm_fully_established(msk); + } return true; } +static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) +{ + u32 old_ack32, cur_ack32; + + if (use_64bit) + return cur_ack; + + old_ack32 = (u32)old_ack; + cur_ack32 = (u32)cur_ack; + cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32; + if (unlikely(before(cur_ack32, old_ack32))) + return cur_ack + (1LL << 32); + return cur_ack; +} + +static void update_una(struct mptcp_sock *msk, + struct mptcp_options_received *mp_opt) +{ + u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); + u64 write_seq = READ_ONCE(msk->write_seq); + + /* avoid ack expansion on update conflict, to reduce the risk of + * wrongly expanding to a future ack sequence number, which is way + * more dangerous than missing an ack + */ + new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); + + /* ACK for data not even sent yet? Ignore. 
*/ + if (after64(new_snd_una, write_seq)) + new_snd_una = old_snd_una; + + while (after64(new_snd_una, old_snd_una)) { + snd_una = old_snd_una; + old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, + new_snd_una); + if (old_snd_una == snd_una) { + mptcp_data_acked((struct sock *)msk); + break; + } + } +} + +static bool add_addr_hmac_valid(struct mptcp_sock *msk, + struct mptcp_options_received *mp_opt) +{ + u64 hmac = 0; + + if (mp_opt->echo) + return true; + + if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) + hmac = add_addr_generate_hmac(msk->remote_key, + msk->local_key, + mp_opt->addr_id, &mp_opt->addr); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else + hmac = add_addr6_generate_hmac(msk->remote_key, + msk->local_key, + mp_opt->addr_id, &mp_opt->addr6); +#endif + + pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", + msk, (unsigned long long)hmac, + (unsigned long long)mp_opt->ahmac); + + return hmac == mp_opt->ahmac; +} + void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, struct tcp_options_received *opt_rx) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); struct mptcp_options_received *mp_opt; struct mptcp_ext *mpext; mp_opt = &opt_rx->mptcp; - if (!check_fully_established(subflow, skb, mp_opt)) + if (!check_fully_established(msk, sk, subflow, skb, mp_opt)) return; + if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) { + struct mptcp_addr_info addr; + + addr.port = htons(mp_opt->port); + addr.id = mp_opt->addr_id; + if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { + addr.family = AF_INET; + addr.addr = mp_opt->addr; + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) { + addr.family = AF_INET6; + addr.addr6 = mp_opt->addr6; + } +#endif + if (!mp_opt->echo) + mptcp_pm_add_addr_received(msk, &addr); + mp_opt->add_addr = 0; + } + if (!mp_opt->dss) return; + /* we can't wait for recvmsg() to update the ack_seq, otherwise + * 
monodirectional flows will stuck + */ + if (mp_opt->use_ack) + update_una(msk, mp_opt); + mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) return; @@ -497,12 +879,6 @@ void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, mpext->use_map = 1; } - if (mp_opt->use_ack) { - mpext->data_ack = mp_opt->data_ack; - mpext->use_ack = 1; - mpext->ack64 = mp_opt->ack64; - } - mpext->data_fin = mp_opt->data_fin; } @@ -521,10 +897,9 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) else len = TCPOLEN_MPTCP_MPC_ACK; - *ptr++ = htonl((TCPOPT_MPTCP << 24) | (len << 16) | - (MPTCPOPT_MP_CAPABLE << 12) | - (MPTCP_SUPPORTED_VERSION << 8) | - MPTCP_CAP_HMAC_SHA256); + *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, + MPTCP_SUPPORTED_VERSION, + MPTCP_CAP_HMAC_SHA256); if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & opts->suboptions)) @@ -546,6 +921,77 @@ void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) } mp_capable_done: + if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { /* a zero ahmac selects the shorter form flagged MPTCP_ADDR_ECHO, which omits the trailing 64-bit HMAC */ + if (opts->ahmac) + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR, 0, + opts->addr_id); + else + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR_BASE, + MPTCP_ADDR_ECHO, + opts->addr_id); + memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); + ptr += 1; + if (opts->ahmac) { + put_unaligned_be64(opts->ahmac, ptr); + ptr += 2; + } + } + /* IPv6 flavour of ADD_ADDR: same layout, with a 16-byte address (four 32-bit words) */ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { + if (opts->ahmac) + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR6, 0, + opts->addr_id); + else + *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, + TCPOLEN_MPTCP_ADD_ADDR6_BASE, + MPTCP_ADDR_ECHO, + opts->addr_id); + memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); + ptr += 4; + if (opts->ahmac) { + put_unaligned_be64(opts->ahmac, ptr); + ptr += 2; + } + } +#endif + /* RM_ADDR only carries the id of the address being withdrawn */ + if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, + TCPOLEN_MPTCP_RM_ADDR_BASE, + 0, 
opts->rm_id); + } + /* MP_JOIN on SYN: header word, then the token and the nonce, one 32-bit word each */ + if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_SYN, + opts->backup, opts->join_id); + put_unaligned_be32(opts->token, ptr); + ptr += 1; + put_unaligned_be32(opts->nonce, ptr); + ptr += 1; + } + /* MP_JOIN on SYN/ACK: header word, the 64-bit thmac, then the 32-bit nonce */ + if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_SYNACK, + opts->backup, opts->join_id); + put_unaligned_be64(opts->thmac, ptr); + ptr += 2; + put_unaligned_be32(opts->nonce, ptr); + ptr += 1; + } + /* MP_JOIN on the third ACK: header word plus the HMAC; ptr moves 5 words, presumably MPTCPOPT_HMAC_LEN / 4 - TODO confirm */ + if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) { + *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, + TCPOLEN_MPTCP_MPJ_ACK, 0, 0); + memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); + ptr += 5; + } + if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { struct mptcp_ext *mpext = &opts->ext_copy; u8 len = TCPOLEN_MPTCP_DSS_BASE; @@ -567,10 +1013,7 @@ mp_capable_done: flags |= MPTCP_DSS_DATA_FIN; } - *ptr++ = htonl((TCPOPT_MPTCP << 24) | - (len << 16) | - (MPTCPOPT_DSS << 12) | - (flags)); + *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { put_unaligned_be64(mpext->data_ack, ptr); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c new file mode 100644 index 000000000000..064639f72487 --- /dev/null +++ b/net/mptcp/pm.c @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * Copyright (c) 2019, Intel Corporation. 
+ */ +#include +#include +#include +#include "protocol.h" + +static struct workqueue_struct *pm_wq; + +/* path manager command handlers */ + +int mptcp_pm_announce_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + pr_debug("msk=%p, local_id=%d", msk, addr->id); + /* stash the address and raise addr_signal; mptcp_pm_addr_signal() consumes both */ + msk->pm.local = *addr; + WRITE_ONCE(msk->pm.addr_signal, true); + return 0; +} + /* not implemented yet: always fails with -ENOTSUPP */ +int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id) +{ + return -ENOTSUPP; +} + /* not implemented yet: always fails with -ENOTSUPP */ +int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id) +{ + return -ENOTSUPP; +} + +/* path manager event handlers */ + +void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side) +{ + struct mptcp_pm_data *pm = &msk->pm; + + pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side); + + WRITE_ONCE(pm->server_side, server_side); +} + /* Reserve a slot for one more subflow: returns true and bumps pm->subflows while below subflows_max; accept_subflow is cleared when the last slot is taken */ +bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + int ret; + + pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows, + pm->subflows_max, READ_ONCE(pm->accept_subflow)); + + /* try to avoid acquiring the lock below */ + if (!READ_ONCE(pm->accept_subflow)) + return false; + + spin_lock_bh(&pm->lock); + ret = pm->subflows < pm->subflows_max; + if (ret && ++pm->subflows == pm->subflows_max) + WRITE_ONCE(pm->accept_subflow, false); + spin_unlock_bh(&pm->lock); + + return ret; +} + +/* return true if the new status bit is currently cleared, that is, this event + * can be served, eventually by an already scheduled work + */ +static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, + enum mptcp_pm_status new_status) +{ + pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status, + BIT(new_status)); + if (msk->pm.status & BIT(new_status)) + return false; + /* take a socket reference only when the work item was actually queued; pm_worker() drops it */ + msk->pm.status |= BIT(new_status); + if (queue_work(pm_wq, &msk->pm.work)) + sock_hold((struct sock *)msk); + return true; +} + +void mptcp_pm_fully_established(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + + 
pr_debug("msk=%p", msk); + + /* try to avoid acquiring the lock below */ + if (!READ_ONCE(pm->work_pending)) + return; + + spin_lock_bh(&pm->lock); + /* re-check under the lock before scheduling the ESTABLISHED event */ + if (READ_ONCE(pm->work_pending)) + mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED); + + spin_unlock_bh(&pm->lock); +} + /* placeholder event handler: only logs */ +void mptcp_pm_connection_closed(struct mptcp_sock *msk) +{ + pr_debug("msk=%p", msk); +} + /* Schedule the SUBFLOW_ESTABLISHED event if the path manager still has work pending; the subflow argument is not used here */ +void mptcp_pm_subflow_established(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + struct mptcp_pm_data *pm = &msk->pm; + + pr_debug("msk=%p", msk); + + if (!READ_ONCE(pm->work_pending)) + return; + + spin_lock_bh(&pm->lock); + + if (READ_ONCE(pm->work_pending)) + mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); + + spin_unlock_bh(&pm->lock); +} + /* placeholder event handler: only logs */ +void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id) +{ + pr_debug("msk=%p", msk); +} + /* Record a remote address announced via ADD_ADDR and schedule its handling; the address is stored only when the event was not already pending */ +void mptcp_pm_add_addr_received(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) +{ + struct mptcp_pm_data *pm = &msk->pm; + + pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id, + READ_ONCE(pm->accept_addr)); + + /* avoid acquiring the lock if there is no room for further addresses */ + if (!READ_ONCE(pm->accept_addr)) + return; + + spin_lock_bh(&pm->lock); + + /* be sure there is something to signal re-checking under PM lock */ + if (READ_ONCE(pm->accept_addr) && + mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) + pm->remote = *addr; + + spin_unlock_bh(&pm->lock); +} + +/* path manager helpers */ + /* Hand the pending local address to the option writer: returns true and copies it into saddr when a signal is pending and the option fits in the remaining space; the pending flag is then cleared */ +bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, + struct mptcp_addr_info *saddr) +{ + int ret = false; + + spin_lock_bh(&msk->pm.lock); + + /* double check after the lock is acquired */ + if (!mptcp_pm_should_signal(msk)) + goto out_unlock; + + if (remaining < mptcp_add_addr_len(msk->pm.local.family)) + goto out_unlock; + + *saddr = msk->pm.local; + WRITE_ONCE(msk->pm.addr_signal, false); + ret = true; + +out_unlock: + spin_unlock_bh(&msk->pm.lock); + return ret; +} + +int 
mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) +{ + return mptcp_pm_nl_get_local_id(msk, skc); +} + /* Work handler: with the msk socket lock and pm.lock held, clear each pending status bit and run the matching netlink PM handler, then drop the reference taken when the work was queued */ +static void pm_worker(struct work_struct *work) +{ + struct mptcp_pm_data *pm = container_of(work, struct mptcp_pm_data, + work); + struct mptcp_sock *msk = container_of(pm, struct mptcp_sock, pm); + struct sock *sk = (struct sock *)msk; + + lock_sock(sk); + spin_lock_bh(&msk->pm.lock); + + pr_debug("msk=%p status=%x", msk, pm->status); + if (pm->status & BIT(MPTCP_PM_ADD_ADDR_RECEIVED)) { + pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_RECEIVED); + mptcp_pm_nl_add_addr_received(msk); + } + if (pm->status & BIT(MPTCP_PM_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); + mptcp_pm_nl_fully_established(msk); + } + if (pm->status & BIT(MPTCP_PM_SUBFLOW_ESTABLISHED)) { + pm->status &= ~BIT(MPTCP_PM_SUBFLOW_ESTABLISHED); + mptcp_pm_nl_subflow_established(msk); + } + + spin_unlock_bh(&msk->pm.lock); + release_sock(sk); + sock_put(sk); +} + /* Reset the per-connection PM counters and flags, set up the lock and the work item, then let the netlink PM load its limits */ +void mptcp_pm_data_init(struct mptcp_sock *msk) +{ + msk->pm.add_addr_signaled = 0; + msk->pm.add_addr_accepted = 0; + msk->pm.local_addr_used = 0; + msk->pm.subflows = 0; + WRITE_ONCE(msk->pm.work_pending, false); + WRITE_ONCE(msk->pm.addr_signal, false); + WRITE_ONCE(msk->pm.accept_addr, false); + WRITE_ONCE(msk->pm.accept_subflow, false); + msk->pm.status = 0; + + spin_lock_init(&msk->pm.lock); + INIT_WORK(&msk->pm.work, pm_worker); + + mptcp_pm_nl_data_init(msk); +} + /* Cancel any queued PM work; when a pending item was cancelled the worker will not run, so the reference taken at queue time is dropped here */ +void mptcp_pm_close(struct mptcp_sock *msk) +{ + if (cancel_work_sync(&msk->pm.work)) + sock_put((struct sock *)msk); +} + /* Allocate the PM workqueue and initialise the netlink PM; panics if the workqueue cannot be allocated */ +void mptcp_pm_init(void) +{ + pm_wq = alloc_workqueue("pm_wq", WQ_UNBOUND | WQ_MEM_RECLAIM, 8); + if (!pm_wq) + panic("Failed to allocate workqueue"); + + mptcp_pm_nl_init(); +} diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c new file mode 100644 index 000000000000..a0ce7f324499 --- /dev/null +++ b/net/mptcp/pm_netlink.c @@ -0,0 +1,857 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Multipath TCP + * + * 
Copyright (c) 2020, Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "protocol.h" + +/* forward declaration */ +static struct genl_family mptcp_genl_family; + +static int pm_nl_pernet_id; + /* one configured local address; entries sit on pm_nl_pernet.local_addr_list and are freed through the rcu head */ +struct mptcp_pm_addr_entry { + struct list_head list; + unsigned int flags; + int ifindex; + struct mptcp_addr_info addr; + struct rcu_head rcu; +}; + /* per network namespace path manager state: the address list plus the limits copied into each new msk */ +struct pm_nl_pernet { + /* protects pernet updates */ + spinlock_t lock; + struct list_head local_addr_list; + unsigned int addrs; + unsigned int add_addr_signal_max; + unsigned int add_addr_accept_max; + unsigned int local_addr_max; + unsigned int subflows_max; + unsigned int next_id; +}; + /* upper bound for the number of list entries and for the limits accepted by parse_limit() */ +#define MPTCP_PM_ADDR_MAX 8 + /* Compare two addresses: family and address must match, the port only when use_port is set */ +static bool addresses_equal(const struct mptcp_addr_info *a, + struct mptcp_addr_info *b, bool use_port) +{ + bool addr_equals = false; + + if (a->family != b->family) + return false; + + if (a->family == AF_INET) + addr_equals = a->addr.s_addr == b->addr.s_addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else + addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); +#endif + + if (!addr_equals) + return false; + if (!use_port) + return true; + + return a->port == b->port; +} + /* Fill addr with the family and local (receive) address of skc; the port is set to 0 and the id is not touched */ +static void local_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) +{ + addr->port = 0; + addr->family = skc->skc_family; + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_rcv_saddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_rcv_saddr; +#endif +} + /* Fill addr with the family, destination address and destination port of skc */ +static void remote_address(const struct sock_common *skc, + struct mptcp_addr_info *addr) +{ + addr->family = skc->skc_family; + addr->port = skc->skc_dport; + if (addr->family == AF_INET) + addr->addr.s_addr = skc->skc_daddr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6) + addr->addr6 = skc->skc_v6_daddr; +#endif +} + +static bool lookup_subflow_by_saddr(const struct list_head *list, + struct mptcp_addr_info *saddr) +{ + struct 
mptcp_subflow_context *subflow; + struct mptcp_addr_info cur; + struct sock_common *skc; + /* true as soon as one subflow on the list has saddr as its local address; ports are not compared */ + list_for_each_entry(subflow, list, node) { + skc = (struct sock_common *)mptcp_subflow_tcp_sock(subflow); + + local_address(skc, &cur); + if (addresses_equal(&cur, saddr, false)) + return true; + } + + return false; +} + /* Pick the first SUBFLOW-flagged pernet address of the msk family that no subflow on conn_list or join_list already uses; NULL when there is none. NOTE(review): the entry is returned after rcu_read_unlock(), so its lifetime depends on the caller - confirm */ +static struct mptcp_pm_addr_entry * +select_local_address(const struct pm_nl_pernet *pernet, + struct mptcp_sock *msk) +{ + struct mptcp_pm_addr_entry *entry, *ret = NULL; + + rcu_read_lock(); + spin_lock_bh(&msk->join_list_lock); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) + continue; + + /* avoid any address already in use by subflows and + * pending join + */ + if (entry->addr.family == ((struct sock *)msk)->sk_family && + !lookup_subflow_by_saddr(&msk->conn_list, &entry->addr) && + !lookup_subflow_by_saddr(&msk->join_list, &entry->addr)) { + ret = entry; + break; + } + } + spin_unlock_bh(&msk->join_list_lock); + rcu_read_unlock(); + return ret; +} + /* Return the pos-th (0-based) SIGNAL-flagged pernet address, or NULL when fewer such entries exist */ +static struct mptcp_pm_addr_entry * +select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) +{ + struct mptcp_pm_addr_entry *entry, *ret = NULL; + int i = 0; + + rcu_read_lock(); + /* do not keep any additional per socket state, just signal + * the address list in order. + * Note: removal from the local address list during the msk life-cycle + * can lead to additional addresses not being announced. 
+ */ + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) + continue; + if (i++ == pos) { + ret = entry; + break; + } + } + rcu_read_unlock(); + return ret; +} + /* Clear work_pending once every address has been signalled and either all local addresses are used or the subflow limit is reached */ +static void check_work_pending(struct mptcp_sock *msk) +{ + if (msk->pm.add_addr_signaled == msk->pm.add_addr_signal_max && + (msk->pm.local_addr_used == msk->pm.local_addr_max || + msk->pm.subflows == msk->pm.subflows_max)) + WRITE_ONCE(msk->pm.work_pending, false); +} + /* Announce the next signal address, if one is left, then try to open one subflow from an unused local address. Runs with pm.lock held (taken by pm_worker()); the lock is dropped around __mptcp_subflow_connect() */ +static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *local; + struct mptcp_addr_info remote; + struct pm_nl_pernet *pernet; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + + pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", + msk->pm.local_addr_used, msk->pm.local_addr_max, + msk->pm.add_addr_signaled, msk->pm.add_addr_signal_max, + msk->pm.subflows, msk->pm.subflows_max); + + /* check first for announce */ + if (msk->pm.add_addr_signaled < msk->pm.add_addr_signal_max) { + local = select_signal_address(pernet, + msk->pm.add_addr_signaled); + + if (local) { + msk->pm.add_addr_signaled++; + mptcp_pm_announce_addr(msk, &local->addr); + } else { + /* pick failed, avoid further attempts later */ + msk->pm.local_addr_used = msk->pm.add_addr_signal_max; + } + + check_work_pending(msk); + } + + /* check if should create a new subflow */ + if (msk->pm.local_addr_used < msk->pm.local_addr_max && + msk->pm.subflows < msk->pm.subflows_max) { + remote_address((struct sock_common *)sk, &remote); + + local = select_local_address(pernet, msk); + if (local) { + msk->pm.local_addr_used++; + msk->pm.subflows++; + check_work_pending(msk); + spin_unlock_bh(&msk->pm.lock); + __mptcp_subflow_connect(sk, local->ifindex, + &local->addr, &remote); + spin_lock_bh(&msk->pm.lock); + return; + } + + /* lookup failed, avoid further attempts later */ + msk->pm.local_addr_used = 
msk->pm.local_addr_max; + check_work_pending(msk); + } +} + /* ESTABLISHED event: same action as for an established subflow */ +void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + /* SUBFLOW_ESTABLISHED event: signal the next address and/or open the next subflow */ +void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) +{ + mptcp_pm_create_subflow_or_signal_addr(msk); +} + /* ADD_ADDR_RECEIVED event: account for the accepted address, stop accepting once a limit is hit and connect a new subflow to msk->pm.remote. Runs with pm.lock held; the lock is dropped around the connect */ +void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + struct mptcp_addr_info remote; + struct mptcp_addr_info local; + + pr_debug("accepted %d:%d remote family %d", + msk->pm.add_addr_accepted, msk->pm.add_addr_accept_max, + msk->pm.remote.family); + msk->pm.add_addr_accepted++; + msk->pm.subflows++; + if (msk->pm.add_addr_accepted >= msk->pm.add_addr_accept_max || + msk->pm.subflows >= msk->pm.subflows_max) + WRITE_ONCE(msk->pm.accept_addr, false); + + /* connect to the specified remote address, using whatever + * local address the routing configuration will pick. + */ + remote = msk->pm.remote; + if (!remote.port) + remote.port = sk->sk_dport; + memset(&local, 0, sizeof(local)); + local.family = remote.family; + + spin_unlock_bh(&msk->pm.lock); + __mptcp_subflow_connect((struct sock *)msk, 0, &local, &remote); + spin_lock_bh(&msk->pm.lock); +} + /* True for entries flagged SIGNAL but not SUBFLOW */ +static bool address_use_port(struct mptcp_pm_addr_entry *entry) +{ + return (entry->flags & + (MPTCP_PM_ADDR_FLAG_SIGNAL | MPTCP_PM_ADDR_FLAG_SUBFLOW)) == + MPTCP_PM_ADDR_FLAG_SIGNAL; +} + /* Append entry to the pernet list under pernet->lock. Returns the newly assigned address id, or -EINVAL when the ids or the list slots are exhausted or the address is already present */ +static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, + struct mptcp_pm_addr_entry *entry) +{ + struct mptcp_pm_addr_entry *cur; + int ret = -EINVAL; + + spin_lock_bh(&pernet->lock); + /* to keep the code simple, don't do IDR-like allocation for address ID, + * just bail when we exceed limits + */ + if (pernet->next_id > 255) + goto out; + if (pernet->addrs >= MPTCP_PM_ADDR_MAX) + goto out; + + /* do not insert duplicate address, differentiate on port only + * when both entries are signal-only addresses + */ + list_for_each_entry(cur, &pernet->local_addr_list, list) { + if (addresses_equal(&cur->addr, 
&entry->addr, + address_use_port(entry) && + address_use_port(cur))) + goto out; + } + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) + pernet->add_addr_signal_max++; + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + pernet->local_addr_max++; + + entry->addr.id = pernet->next_id++; + pernet->addrs++; + list_add_tail_rcu(&entry->list, &pernet->local_addr_list); + ret = entry->addr.id; + +out: + spin_unlock_bh(&pernet->lock); + return ret; +} + +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) +{ + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info skc_local; + struct mptcp_addr_info msk_local; + struct pm_nl_pernet *pernet; + int ret = -1; + + if (WARN_ON_ONCE(!msk)) + return -1; + + /* The 0 ID mapping is defined by the first subflow, copied into the msk + * addr + */ + local_address((struct sock_common *)msk, &msk_local); + local_address((struct sock_common *)msk, &skc_local); + if (addresses_equal(&msk_local, &skc_local, false)) + return 0; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + if (addresses_equal(&entry->addr, &skc_local, false)) { + ret = entry->addr.id; + break; + } + } + rcu_read_unlock(); + if (ret >= 0) + return ret; + + /* address not found, add to local list */ + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->flags = 0; + entry->addr = skc_local; + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); + if (ret < 0) + kfree(entry); + + return ret; +} + +void mptcp_pm_nl_data_init(struct mptcp_sock *msk) +{ + struct mptcp_pm_data *pm = &msk->pm; + struct pm_nl_pernet *pernet; + bool subflows; + + pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); + + pm->add_addr_signal_max = READ_ONCE(pernet->add_addr_signal_max); + pm->add_addr_accept_max = READ_ONCE(pernet->add_addr_accept_max); + pm->local_addr_max = 
READ_ONCE(pernet->local_addr_max); + pm->subflows_max = READ_ONCE(pernet->subflows_max); + subflows = !!pm->subflows_max; + WRITE_ONCE(pm->work_pending, (!!pm->local_addr_max && subflows) || + !!pm->add_addr_signal_max); + WRITE_ONCE(pm->accept_addr, !!pm->add_addr_accept_max && subflows); + WRITE_ONCE(pm->accept_subflow, subflows); +} + /* index of the only multicast group in mptcp_pm_mcgrps */ +#define MPTCP_PM_CMD_GRP_OFFSET 0 + +static const struct genl_multicast_group mptcp_pm_mcgrps[] = { + [MPTCP_PM_CMD_GRP_OFFSET] = { .name = MPTCP_PM_CMD_GRP_NAME, }, +}; + /* policy for the attributes nested inside MPTCP_PM_ATTR_ADDR */ +static const struct nla_policy +mptcp_pm_addr_policy[MPTCP_PM_ADDR_ATTR_MAX + 1] = { + [MPTCP_PM_ADDR_ATTR_FAMILY] = { .type = NLA_U16, }, + [MPTCP_PM_ADDR_ATTR_ID] = { .type = NLA_U8, }, + [MPTCP_PM_ADDR_ATTR_ADDR4] = { .type = NLA_U32, }, + [MPTCP_PM_ADDR_ATTR_ADDR6] = { .type = NLA_EXACT_LEN, + .len = sizeof(struct in6_addr), }, + [MPTCP_PM_ADDR_ATTR_PORT] = { .type = NLA_U16 }, + [MPTCP_PM_ADDR_ATTR_FLAGS] = { .type = NLA_U32 }, + [MPTCP_PM_ADDR_ATTR_IF_IDX] = { .type = NLA_S32 }, +}; + /* policy for the top-level attributes of the family */ +static const struct nla_policy mptcp_pm_policy[MPTCP_PM_ATTR_MAX + 1] = { + [MPTCP_PM_ATTR_ADDR] = + NLA_POLICY_NESTED(mptcp_pm_addr_policy), + [MPTCP_PM_ATTR_RCV_ADD_ADDRS] = { .type = NLA_U32, }, + [MPTCP_PM_ATTR_SUBFLOWS] = { .type = NLA_U32, }, +}; + /* Return the attribute type carrying the address for family; anything but AF_INET6 maps to the IPv4 attribute */ +static int mptcp_pm_family_to_addr(int family) +{ +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (family == AF_INET6) + return MPTCP_PM_ADDR_ATTR_ADDR6; +#endif + return MPTCP_PM_ADDR_ATTR_ADDR4; +} + /* Parse the nested address attribute into a zeroed entry. With require_family unset the family/address may be omitted, so that commands can identify an entry by id alone. Returns 0 or a negative error with extack filled in */ +static int mptcp_pm_parse_addr(struct nlattr *attr, struct genl_info *info, + bool require_family, + struct mptcp_pm_addr_entry *entry) +{ + struct nlattr *tb[MPTCP_PM_ADDR_ATTR_MAX + 1]; + int err, addr_addr; + + if (!attr) { + GENL_SET_ERR_MSG(info, "missing address info"); + return -EINVAL; + } + + /* no validation needed - was already done via nested policy */ + err = nla_parse_nested_deprecated(tb, MPTCP_PM_ADDR_ATTR_MAX, attr, + mptcp_pm_addr_policy, info->extack); + if (err) + return err; + + memset(entry, 0, 
sizeof(*entry)); + if (!tb[MPTCP_PM_ADDR_ATTR_FAMILY]) { + if (!require_family) + goto skip_family; + + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "missing family"); + return -EINVAL; + } + + entry->addr.family = nla_get_u16(tb[MPTCP_PM_ADDR_ATTR_FAMILY]); + if (entry->addr.family != AF_INET +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + && entry->addr.family != AF_INET6 +#endif + ) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "unknown address family"); + return -EINVAL; + } + addr_addr = mptcp_pm_family_to_addr(entry->addr.family); + if (!tb[addr_addr]) { + NL_SET_ERR_MSG_ATTR(info->extack, attr, + "missing address data"); + return -EINVAL; + } + +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (entry->addr.family == AF_INET6) + entry->addr.addr6 = nla_get_in6_addr(tb[addr_addr]); + else +#endif + entry->addr.addr.s_addr = nla_get_in_addr(tb[addr_addr]); + /* the remaining attributes are optional and default to 0 thanks to the memset() above */ +skip_family: + if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) + entry->ifindex = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); + + if (tb[MPTCP_PM_ADDR_ATTR_ID]) + entry->addr.id = nla_get_u8(tb[MPTCP_PM_ADDR_ATTR_ID]); + + if (tb[MPTCP_PM_ADDR_ATTR_FLAGS]) + entry->flags = nla_get_u32(tb[MPTCP_PM_ADDR_ATTR_FLAGS]); + + return 0; +} + /* Return the PM pernet data of the namespace the request came from */ +static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) +{ + return net_generic(genl_info_net(info), pm_nl_pernet_id); +} + /* MPTCP_PM_CMD_ADD_ADDR: parse the address (family required), copy it into a heap entry and append it to the pernet list; the entry is freed again when the append fails */ +static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + int ret; + + ret = mptcp_pm_parse_addr(attr, info, true, &addr); + if (ret < 0) + return ret; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + GENL_SET_ERR_MSG(info, "can't allocate addr"); + return -ENOMEM; + } + + *entry = addr; + ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); + if (ret < 0) { + GENL_SET_ERR_MSG(info, "too many addresses or duplicate one"); + kfree(entry); + return ret; + } + + return 0; +} + 
+static struct mptcp_pm_addr_entry * +__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) +{ + struct mptcp_pm_addr_entry *entry; + /* plain (non-RCU) list walk: every caller in this file holds pernet->lock */ + list_for_each_entry(entry, &pernet->local_addr_list, list) { + if (entry->addr.id == id) + return entry; + } + return NULL; +} + /* MPTCP_PM_CMD_DEL_ADDR: remove the entry with the given id, undo its contribution to the pernet counters and free it after a grace period */ +static int mptcp_nl_cmd_del_addr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + int ret; + + ret = mptcp_pm_parse_addr(attr, info, false, &addr); + if (ret < 0) + return ret; + + spin_lock_bh(&pernet->lock); + entry = __lookup_addr_by_id(pernet, addr.addr.id); + if (!entry) { + GENL_SET_ERR_MSG(info, "address not found"); + ret = -EINVAL; + goto out; + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) + pernet->add_addr_signal_max--; + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) + pernet->local_addr_max--; + + pernet->addrs--; + list_del_rcu(&entry->list); + kfree_rcu(entry, rcu); +out: + spin_unlock_bh(&pernet->lock); + return ret; +} + /* Unlink every entry from the pernet list and free it after a grace period; counters are not touched here */ +static void __flush_addrs(struct pm_nl_pernet *pernet) +{ + while (!list_empty(&pernet->local_addr_list)) { + struct mptcp_pm_addr_entry *cur; + + cur = list_entry(pernet->local_addr_list.next, + struct mptcp_pm_addr_entry, list); + list_del_rcu(&cur->list); + kfree_rcu(cur, rcu); + } +} + /* Zero the address-derived counters and add_addr_accept_max; subflows_max and next_id are left as they are */ +static void __reset_counters(struct pm_nl_pernet *pernet) +{ + pernet->add_addr_signal_max = 0; + pernet->add_addr_accept_max = 0; + pernet->local_addr_max = 0; + pernet->addrs = 0; +} + /* MPTCP_PM_CMD_FLUSH_ADDRS: drop all addresses and reset the counters under pernet->lock */ +static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + + spin_lock_bh(&pernet->lock); + __flush_addrs(pernet); + __reset_counters(pernet); + spin_unlock_bh(&pernet->lock); + return 0; +} + +static int mptcp_nl_fill_addr(struct sk_buff *skb, + struct mptcp_pm_addr_entry *entry) +{ + struct mptcp_addr_info *addr = &entry->addr; + struct nlattr 
*attr; + /* emit entry as a nested MPTCP_PM_ATTR_ADDR attribute; on any failure the partially built nest is cancelled and -EMSGSIZE returned */ + attr = nla_nest_start(skb, MPTCP_PM_ATTR_ADDR); + if (!attr) + return -EMSGSIZE; + + if (nla_put_u16(skb, MPTCP_PM_ADDR_ATTR_FAMILY, addr->family)) + goto nla_put_failure; + if (nla_put_u8(skb, MPTCP_PM_ADDR_ATTR_ID, addr->id)) + goto nla_put_failure; + if (nla_put_u32(skb, MPTCP_PM_ADDR_ATTR_FLAGS, entry->flags)) + goto nla_put_failure; + if (entry->ifindex && + nla_put_s32(skb, MPTCP_PM_ADDR_ATTR_IF_IDX, entry->ifindex)) + goto nla_put_failure; + /* the address puts can run out of room as well: fail instead of emitting an entry without its address */ + if (addr->family == AF_INET && + nla_put_in_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR4, + addr->addr.s_addr)) + goto nla_put_failure; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->family == AF_INET6 && + nla_put_in6_addr(skb, MPTCP_PM_ADDR_ATTR_ADDR6, &addr->addr6)) + goto nla_put_failure; +#endif + nla_nest_end(skb, attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, attr); + return -EMSGSIZE; +} + /* MPTCP_PM_CMD_GET_ADDR (doit): look the entry up by id and send it back in a reply message; the message is freed on every failure path */ +static int mptcp_nl_cmd_get_addr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct mptcp_pm_addr_entry addr, *entry; + struct sk_buff *msg; + void *reply; + int ret; + + ret = mptcp_pm_parse_addr(attr, info, false, &addr); + if (ret < 0) + return ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + info->genlhdr->cmd); + if (!reply) { + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + ret = -EMSGSIZE; + goto fail; + } + + spin_lock_bh(&pernet->lock); + entry = __lookup_addr_by_id(pernet, addr.addr.id); + if (!entry) { + GENL_SET_ERR_MSG(info, "address not found"); + ret = -EINVAL; + goto unlock_fail; + } + + ret = mptcp_nl_fill_addr(msg, entry); + if (ret) + goto unlock_fail; + + genlmsg_end(msg, reply); + ret = genlmsg_reply(msg, info); + spin_unlock_bh(&pernet->lock); + return ret; + +unlock_fail: + spin_unlock_bh(&pernet->lock); + +fail: + nlmsg_free(msg); + return ret; +} + +static int mptcp_nl_cmd_dump_addrs(struct 
sk_buff *msg, + struct netlink_callback *cb) +{ + struct net *net = sock_net(msg->sk); + struct mptcp_pm_addr_entry *entry; + struct pm_nl_pernet *pernet; + int id = cb->args[0]; + void *hdr; + + pernet = net_generic(net, pm_nl_pernet_id); + /* cb->args[0] holds the id of the last entry already dumped, so a resumed dump skips everything up to it */ + spin_lock_bh(&pernet->lock); + list_for_each_entry(entry, &pernet->local_addr_list, list) { + if (entry->addr.id <= id) + continue; + + hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &mptcp_genl_family, + NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); + if (!hdr) + break; + + if (mptcp_nl_fill_addr(msg, entry) < 0) { + genlmsg_cancel(msg, hdr); + break; + } + + id = entry->addr.id; + genlmsg_end(msg, hdr); + } + spin_unlock_bh(&pernet->lock); + + cb->args[0] = id; + return msg->len; +} + /* Read the optional u32 limit attribute id into *limit; *limit keeps its previous value when the attribute is absent. Values above MPTCP_PM_ADDR_MAX are rejected with -EINVAL, after *limit has already been overwritten */ +static int parse_limit(struct genl_info *info, int id, unsigned int *limit) +{ + struct nlattr *attr = info->attrs[id]; + + if (!attr) + return 0; + + *limit = nla_get_u32(attr); + if (*limit > MPTCP_PM_ADDR_MAX) { + GENL_SET_ERR_MSG(info, "limit greater than maximum"); + return -EINVAL; + } + return 0; +} + /* MPTCP_PM_CMD_SET_LIMITS: update add_addr_accept_max and subflows_max; both values are validated before either is stored, so a bad request changes nothing */ +static int +mptcp_nl_cmd_set_limits(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + unsigned int rcv_addrs, subflows; + int ret; + + spin_lock_bh(&pernet->lock); + rcv_addrs = pernet->add_addr_accept_max; + ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); + if (ret) + goto unlock; + + subflows = pernet->subflows_max; + ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); + if (ret) + goto unlock; + + WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); + WRITE_ONCE(pernet->subflows_max, subflows); + +unlock: + spin_unlock_bh(&pernet->lock); + return ret; +} + /* MPTCP_PM_CMD_GET_LIMITS: reply with the current add_addr_accept_max and subflows_max */ +static int +mptcp_nl_cmd_get_limits(struct sk_buff *skb, struct genl_info *info) +{ + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); + struct sk_buff *msg; + void *reply; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + reply = 
genlmsg_put_reply(msg, info, &mptcp_genl_family, 0, + MPTCP_PM_CMD_GET_LIMITS); + if (!reply) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, + READ_ONCE(pernet->add_addr_accept_max))) + goto fail; + + if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, + READ_ONCE(pernet->subflows_max))) + goto fail; + + genlmsg_end(msg, reply); + return genlmsg_reply(msg, info); + +fail: + GENL_SET_ERR_MSG(info, "not enough space in Netlink message"); + nlmsg_free(msg); + return -EMSGSIZE; +} + /* command table: the commands that modify state carry GENL_ADMIN_PERM, the two GET commands carry no flag */ +static struct genl_ops mptcp_pm_ops[] = { + { + .cmd = MPTCP_PM_CMD_ADD_ADDR, + .doit = mptcp_nl_cmd_add_addr, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_DEL_ADDR, + .doit = mptcp_nl_cmd_del_addr, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_FLUSH_ADDRS, + .doit = mptcp_nl_cmd_flush_addrs, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_GET_ADDR, + .doit = mptcp_nl_cmd_get_addr, + .dumpit = mptcp_nl_cmd_dump_addrs, + }, + { + .cmd = MPTCP_PM_CMD_SET_LIMITS, + .doit = mptcp_nl_cmd_set_limits, + .flags = GENL_ADMIN_PERM, + }, + { + .cmd = MPTCP_PM_CMD_GET_LIMITS, + .doit = mptcp_nl_cmd_get_limits, + }, +}; + /* generic netlink family descriptor for the path manager */ +static struct genl_family mptcp_genl_family __ro_after_init = { + .name = MPTCP_PM_NAME, + .version = MPTCP_PM_VER, + .maxattr = MPTCP_PM_ATTR_MAX, + .policy = mptcp_pm_policy, + .netnsok = true, + .module = THIS_MODULE, + .ops = mptcp_pm_ops, + .n_ops = ARRAY_SIZE(mptcp_pm_ops), + .mcgrps = mptcp_pm_mcgrps, + .n_mcgrps = ARRAY_SIZE(mptcp_pm_mcgrps), +}; + /* per-namespace init: empty address list, zeroed counters, ids handed out starting from 1 */ +static int __net_init pm_nl_init_net(struct net *net) +{ + struct pm_nl_pernet *pernet = net_generic(net, pm_nl_pernet_id); + + INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + __reset_counters(pernet); + pernet->next_id = 1; + spin_lock_init(&pernet->lock); + return 0; +} + /* per-namespace teardown: free the address list of every namespace in the exit batch */ +static void __net_exit pm_nl_exit_net(struct list_head *net_list) +{ + struct net *net; + + list_for_each_entry(net, net_list, exit_list) { + /* net is removed from namespace list, can't race with + * other 
modifiers + */ + __flush_addrs(net_generic(net, pm_nl_pernet_id)); + } +} + /* pernet hooks; .size makes the core allocate one struct pm_nl_pernet per namespace, reachable through pm_nl_pernet_id */ +static struct pernet_operations mptcp_pm_pernet_ops = { + .init = pm_nl_init_net, + .exit_batch = pm_nl_exit_net, + .id = &pm_nl_pernet_id, + .size = sizeof(struct pm_nl_pernet), +}; + /* Register the pernet subsystem and the generic netlink family; panics if either registration fails */ +void mptcp_pm_nl_init(void) +{ + if (register_pernet_subsys(&mptcp_pm_pernet_ops) < 0) + panic("Failed to register MPTCP PM pernet subsystem.\n"); + + if (genl_register_family(&mptcp_genl_family)) + panic("Failed to register MPTCP PM netlink family\n"); +} diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 92d5382e71f4..1833bc1f4a43 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -21,6 +21,7 @@ #endif #include #include "protocol.h" +#include "mib.h" #define MPTCP_SAME_STATE TCP_MAX_STATES @@ -37,6 +38,8 @@ struct mptcp_skb_cb { #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) +static struct percpu_counter mptcp_sockets_allocated; + /* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not * completed yet or has failed, return the subflow socket. * Otherwise return NULL. 
@@ -104,19 +107,6 @@ set_state: return ssock; } -static struct sock *mptcp_subflow_get(const struct mptcp_sock *msk) -{ - struct mptcp_subflow_context *subflow; - - sock_owned_by_me((const struct sock *)msk); - - mptcp_for_each_subflow(msk, subflow) { - return mptcp_subflow_tcp_sock(subflow); - } - - return NULL; -} - static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, struct sk_buff *skb, unsigned int offset, size_t copy_len) @@ -254,6 +244,60 @@ wake: sk->sk_data_ready(sk); } +static void __mptcp_flush_join_list(struct mptcp_sock *msk) +{ + if (likely(list_empty(&msk->join_list))) + return; + + spin_lock_bh(&msk->join_list_lock); + list_splice_tail_init(&msk->join_list, &msk->conn_list); + spin_unlock_bh(&msk->join_list_lock); +} + +static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk) +{ + long tout = ssk && inet_csk(ssk)->icsk_pending ? + inet_csk(ssk)->icsk_timeout - jiffies : 0; + + if (tout <= 0) + tout = mptcp_sk(sk)->timer_ival; + mptcp_sk(sk)->timer_ival = tout > 0 ? 
tout : TCP_RTO_MIN; +} + +static bool mptcp_timer_pending(struct sock *sk) +{ + return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); +} + +static void mptcp_reset_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + unsigned long tout; + + /* should never be called with mptcp level timer cleared */ + tout = READ_ONCE(mptcp_sk(sk)->timer_ival); + if (WARN_ON_ONCE(!tout)) + tout = TCP_RTO_MIN; + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); +} + +void mptcp_data_acked(struct sock *sk) +{ + mptcp_reset_timer(sk); + + if (!sk_stream_is_writeable(sk) && + schedule_work(&mptcp_sk(sk)->work)) + sock_hold(sk); +} + +static void mptcp_stop_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + mptcp_sk(sk)->timer_ival = 0; +} + static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) { if (!msk->cached_ext) @@ -277,41 +321,149 @@ static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) return NULL; } -static inline bool mptcp_skb_can_collapse_to(const struct mptcp_sock *msk, - const struct sk_buff *skb, - const struct mptcp_ext *mpext) +static bool mptcp_skb_can_collapse_to(u64 write_seq, + const struct sk_buff *skb, + const struct mptcp_ext *mpext) { if (!tcp_skb_can_collapse_to(skb)) return false; /* can collapse only if MPTCP level sequence is in order */ - return mpext && mpext->data_seq + mpext->data_len == msk->write_seq; + return mpext && mpext->data_seq + mpext->data_len == write_seq; +} + +static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk, + const struct page_frag *pfrag, + const struct mptcp_data_frag *df) +{ + return df && pfrag->page == df->page && + df->data_seq + df->data_len == msk->write_seq; +} + +static void dfrag_uncharge(struct sock *sk, int len) +{ + sk_mem_uncharge(sk, len); + sk_wmem_queued_add(sk, -len); +} + +static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) 
+{ + int len = dfrag->data_len + dfrag->overhead; + + list_del(&dfrag->list); + dfrag_uncharge(sk, len); + put_page(dfrag->page); +} + +static void mptcp_clean_una(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_data_frag *dtmp, *dfrag; + u64 snd_una = atomic64_read(&msk->snd_una); + bool cleaned = false; + + list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) { + if (after64(dfrag->data_seq + dfrag->data_len, snd_una)) + break; + + dfrag_clear(sk, dfrag); + cleaned = true; + } + + dfrag = mptcp_rtx_head(sk); + if (dfrag && after64(snd_una, dfrag->data_seq)) { + u64 delta = dfrag->data_seq + dfrag->data_len - snd_una; + + dfrag->data_seq += delta; + dfrag->data_len -= delta; + + dfrag_uncharge(sk, delta); + cleaned = true; + } + + if (cleaned) { + sk_mem_reclaim_partial(sk); + + /* Only wake up writers if a subflow is ready */ + if (test_bit(MPTCP_SEND_SPACE, &msk->flags)) + sk_stream_write_space(sk); + } +} + +/* ensure we get enough memory for the frag hdr, beyond some minimal amount of + * data + */ +static bool mptcp_page_frag_refill(struct sock *sk, struct page_frag *pfrag) +{ + if (likely(skb_page_frag_refill(32U + sizeof(struct mptcp_data_frag), + pfrag, sk->sk_allocation))) + return true; + + sk->sk_prot->enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return false; +} + +static struct mptcp_data_frag * +mptcp_carve_data_frag(const struct mptcp_sock *msk, struct page_frag *pfrag, + int orig_offset) +{ + int offset = ALIGN(orig_offset, sizeof(long)); + struct mptcp_data_frag *dfrag; + + dfrag = (struct mptcp_data_frag *)(page_to_virt(pfrag->page) + offset); + dfrag->data_len = 0; + dfrag->data_seq = msk->write_seq; + dfrag->overhead = offset - orig_offset + sizeof(struct mptcp_data_frag); + dfrag->offset = offset + sizeof(struct mptcp_data_frag); + dfrag->page = pfrag->page; + + return dfrag; } static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, - struct msghdr *msg, long *timeo, int *pmss_now, 
+ struct msghdr *msg, struct mptcp_data_frag *dfrag, + long *timeo, int *pmss_now, int *ps_goal) { - int mss_now, avail_size, size_goal, ret; + int mss_now, avail_size, size_goal, offset, ret, frag_truesize = 0; + bool dfrag_collapsed, can_collapse = false; struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_ext *mpext = NULL; + bool retransmission = !!dfrag; struct sk_buff *skb, *tail; - bool can_collapse = false; struct page_frag *pfrag; + struct page *page; + u64 *write_seq; size_t psize; /* use the mptcp page cache so that we can easily move the data * from one substream to another, but do per subflow memory accounting + * Note: pfrag is used only !retransmission, but the compiler if + * fooled into a warning if we don't init here */ pfrag = sk_page_frag(sk); - while (!sk_page_frag_refill(ssk, pfrag) || + while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) || !mptcp_ext_cache_refill(msk)) { ret = sk_stream_wait_memory(ssk, timeo); if (ret) return ret; + + /* if sk_stream_wait_memory() sleeps snd_una can change + * significantly, refresh the rtx queue + */ + mptcp_clean_una(sk); + if (unlikely(__mptcp_needs_tcp_fallback(msk))) return 0; } + if (!retransmission) { + write_seq = &msk->write_seq; + page = pfrag->page; + } else { + write_seq = &dfrag->data_seq; + page = dfrag->page; + } /* compute copy limit */ mss_now = tcp_send_mss(ssk, &size_goal, msg->msg_flags); @@ -329,32 +481,74 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * SSN association set here */ can_collapse = (size_goal - skb->len > 0) && - mptcp_skb_can_collapse_to(msk, skb, mpext); + mptcp_skb_can_collapse_to(*write_seq, skb, mpext); if (!can_collapse) TCP_SKB_CB(skb)->eor = 1; else avail_size = size_goal - skb->len; } - psize = min_t(size_t, pfrag->size - pfrag->offset, avail_size); - /* Copy to page */ - pr_debug("left=%zu", msg_data_left(msg)); - psize = copy_page_from_iter(pfrag->page, pfrag->offset, - min_t(size_t, msg_data_left(msg), psize), - 
&msg->msg_iter); - pr_debug("left=%zu", msg_data_left(msg)); - if (!psize) - return -EINVAL; + if (!retransmission) { + /* reuse tail pfrag, if possible, or carve a new one from the + * page allocator + */ + dfrag = mptcp_rtx_tail(sk); + offset = pfrag->offset; + dfrag_collapsed = mptcp_frag_can_collapse_to(msk, pfrag, dfrag); + if (!dfrag_collapsed) { + dfrag = mptcp_carve_data_frag(msk, pfrag, offset); + offset = dfrag->offset; + frag_truesize = dfrag->overhead; + } + psize = min_t(size_t, pfrag->size - offset, avail_size); + + /* Copy to page */ + pr_debug("left=%zu", msg_data_left(msg)); + psize = copy_page_from_iter(pfrag->page, offset, + min_t(size_t, msg_data_left(msg), + psize), + &msg->msg_iter); + pr_debug("left=%zu", msg_data_left(msg)); + if (!psize) + return -EINVAL; + + if (!sk_wmem_schedule(sk, psize + dfrag->overhead)) + return -ENOMEM; + } else { + offset = dfrag->offset; + psize = min_t(size_t, dfrag->data_len, avail_size); + } /* tell the TCP stack to delay the push so that we can safely * access the skb after the sendpages call */ - ret = do_tcp_sendpages(ssk, pfrag->page, pfrag->offset, psize, + ret = do_tcp_sendpages(ssk, page, offset, psize, msg->msg_flags | MSG_SENDPAGE_NOTLAST); if (ret <= 0) return ret; - if (unlikely(ret < psize)) - iov_iter_revert(&msg->msg_iter, psize - ret); + + frag_truesize += ret; + if (!retransmission) { + if (unlikely(ret < psize)) + iov_iter_revert(&msg->msg_iter, psize - ret); + + /* send successful, keep track of sent data for mptcp-level + * retransmission + */ + dfrag->data_len += ret; + if (!dfrag_collapsed) { + get_page(dfrag->page); + list_add_tail(&dfrag->list, &msk->rtx_queue); + sk_wmem_queued_add(sk, frag_truesize); + } else { + sk_wmem_queued_add(sk, ret); + } + + /* charge data on mptcp rtx queue to the master socket + * Note: we charge such data both to sk and ssk + */ + sk->sk_forward_alloc -= frag_truesize; + } /* if the tail skb extension is still the cached one, collapsing * really happened. 
Note: we can't check for 'same skb' as the sk_buff @@ -373,7 +567,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, msk->cached_ext = NULL; memset(mpext, 0, sizeof(*mpext)); - mpext->data_seq = msk->write_seq; + mpext->data_seq = *write_seq; mpext->subflow_seq = mptcp_subflow_ctx(ssk)->rel_write_seq; mpext->data_len = ret; mpext->use_map = 1; @@ -384,13 +578,51 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, mpext->dsn64); out: - pfrag->offset += ret; - msk->write_seq += ret; + if (!retransmission) + pfrag->offset += frag_truesize; + *write_seq += ret; mptcp_subflow_ctx(ssk)->rel_write_seq += ret; return ret; } +static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *backup = NULL; + + sock_owned_by_me((const struct sock *)msk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (!sk_stream_memory_free(ssk)) { + struct socket *sock = ssk->sk_socket; + + if (sock) { + clear_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); + + /* enables sk->write_space() callbacks */ + set_bit(SOCK_NOSPACE, &sock->flags); + } + + return NULL; + } + + if (subflow->backup) { + if (!backup) + backup = ssk; + + continue; + } + + return ssk; + } + + return backup; +} + static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) { struct socket *sock; @@ -438,17 +670,29 @@ fallback: return ret >= 0 ? ret + copied : (copied ? 
copied : ret); } - ssk = mptcp_subflow_get(msk); - if (!ssk) { - release_sock(sk); - return -ENOTCONN; + mptcp_clean_una(sk); + + __mptcp_flush_join_list(msk); + ssk = mptcp_subflow_get_send(msk); + while (!sk_stream_memory_free(sk) || !ssk) { + ret = sk_stream_wait_memory(sk, &timeo); + if (ret) + goto out; + + mptcp_clean_una(sk); + + ssk = mptcp_subflow_get_send(msk); + if (list_empty(&msk->conn_list)) { + ret = -ENOTCONN; + goto out; + } } pr_debug("conn_list->subflow=%p", ssk); lock_sock(ssk); while (msg_data_left(msg)) { - ret = mptcp_sendmsg_frag(sk, ssk, msg, &timeo, &mss_now, + ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now, &size_goal); if (ret < 0) break; @@ -461,10 +705,15 @@ fallback: copied += ret; } + mptcp_set_timeout(sk, ssk); if (copied) { ret = copied; tcp_push(ssk, msg->msg_flags, mss_now, tcp_sk(ssk)->nonagle, size_goal); + + /* start the timer, if it's not pending */ + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); } ssk_check_wmem(msk, ssk); @@ -572,6 +821,7 @@ fallback: len = min_t(size_t, len, INT_MAX); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + __mptcp_flush_join_list(msk); while (len > (size_t)copied) { int bytes_read; @@ -651,6 +901,69 @@ out_err: return copied; } +static void mptcp_retransmit_handler(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (atomic64_read(&msk->snd_una) == msk->write_seq) { + mptcp_stop_timer(sk); + } else { + set_bit(MPTCP_WORK_RTX, &msk->flags); + if (schedule_work(&msk->work)) + sock_hold(sk); + } +} + +static void mptcp_retransmit_timer(struct timer_list *t) +{ + struct inet_connection_sock *icsk = from_timer(icsk, t, + icsk_retransmit_timer); + struct sock *sk = &icsk->icsk_inet.sk; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + mptcp_retransmit_handler(sk); + } else { + /* delegate our work to tcp_release_cb() */ + if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, + &sk->sk_tsq_flags)) + sock_hold(sk); + } + bh_unlock_sock(sk); + sock_put(sk); 
+} + +/* Find an idle subflow. Return NULL if there is unacked data at tcp + * level. + * + * A backup subflow is returned only if that is the only kind available. + */ +static struct sock *mptcp_subflow_get_retrans(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct sock *backup = NULL; + + sock_owned_by_me((const struct sock *)msk); + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + /* still data outstanding at TCP level? Don't retransmit. */ + if (!tcp_write_queue_empty(ssk)) + return NULL; + + if (subflow->backup) { + if (!backup) + backup = ssk; + continue; + } + + return ssk; + } + + return backup; +} + /* subflow sockets can be either outgoing (connect) or incoming * (accept). * @@ -684,10 +997,63 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu) static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); - struct sock *sk = &msk->sk.icsk_inet.sk; + struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; + int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0; + struct mptcp_data_frag *dfrag; + u64 orig_write_seq; + size_t copied = 0; + struct msghdr msg; + long timeo = 0; lock_sock(sk); + mptcp_clean_una(sk); + __mptcp_flush_join_list(msk); __mptcp_move_skbs(msk); + + if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags)) + goto unlock; + + dfrag = mptcp_rtx_head(sk); + if (!dfrag) + goto unlock; + + ssk = mptcp_subflow_get_retrans(msk); + if (!ssk) + goto reset_unlock; + + lock_sock(ssk); + + msg.msg_flags = MSG_DONTWAIT; + orig_len = dfrag->data_len; + orig_offset = dfrag->offset; + orig_write_seq = dfrag->data_seq; + while (dfrag->data_len > 0) { + ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now, + &size_goal); + if (ret < 0) + break; + + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS); + copied += ret; + dfrag->data_len -= ret; + dfrag->offset += ret; + } + if (copied) + 
tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle, + size_goal); + + dfrag->data_seq = orig_write_seq; + dfrag->offset = orig_offset; + dfrag->data_len = orig_len; + + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + +reset_unlock: + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + +unlock: release_sock(sk); sock_put(sk); } @@ -696,22 +1062,55 @@ static int __mptcp_init_sock(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); + spin_lock_init(&msk->join_list_lock); + INIT_LIST_HEAD(&msk->conn_list); + INIT_LIST_HEAD(&msk->join_list); + INIT_LIST_HEAD(&msk->rtx_queue); __set_bit(MPTCP_SEND_SPACE, &msk->flags); INIT_WORK(&msk->work, mptcp_worker); msk->first = NULL; inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; + mptcp_pm_data_init(msk); + + /* re-use the csk retrans timer for MPTCP-level retrans */ + timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); + return 0; } static int mptcp_init_sock(struct sock *sk) { - if (!mptcp_is_enabled(sock_net(sk))) + struct net *net = sock_net(sk); + int ret; + + if (!mptcp_is_enabled(net)) return -ENOPROTOOPT; - return __mptcp_init_sock(sk); + if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) + return -ENOMEM; + + ret = __mptcp_init_sock(sk); + if (ret) + return ret; + + sk_sockets_allocated_inc(sk); + sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[2]; + + return 0; +} + +static void __mptcp_clear_xmit(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_data_frag *dtmp, *dfrag; + + sk_stop_timer(sk, &msk->sk.icsk_retransmit_timer); + + list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) + dfrag_clear(sk, dfrag); } static void mptcp_cancel_work(struct sock *sk) @@ -767,10 +1166,14 @@ static void mptcp_close(struct sock *sk, long timeout) mptcp_token_destroy(msk->token); inet_sk_state_store(sk, TCP_CLOSE); + __mptcp_flush_join_list(msk); + list_splice_init(&msk->conn_list, &conn_list); data_fin_tx_seq = msk->write_seq; + 
__mptcp_clear_xmit(sk); + release_sock(sk); list_for_each_entry_safe(subflow, tmp, &conn_list, node) { @@ -782,6 +1185,7 @@ static void mptcp_close(struct sock *sk, long timeout) } mptcp_cancel_work(sk); + mptcp_pm_close(msk); __skb_queue_purge(&sk->sk_receive_queue); @@ -811,6 +1215,15 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; } +static int mptcp_disconnect(struct sock *sk, int flags) +{ + lock_sock(sk); + __mptcp_clear_xmit(sk); + release_sock(sk); + mptcp_cancel_work(sk); + return tcp_disconnect(sk, flags); +} + #if IS_ENABLED(CONFIG_MPTCP_IPV6) static struct ipv6_pinfo *mptcp_inet6_sk(const struct sock *sk) { @@ -854,6 +1267,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk, struct request_sock *req) } msk->write_seq = subflow_req->idsn + 1; + atomic64_set(&msk->snd_una, msk->write_seq); if (subflow_req->remote_key_valid) { msk->can_ack = true; msk->remote_key = subflow_req->remote_key; @@ -920,7 +1334,12 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, list_add(&subflow->node, &msk->conn_list); bh_unlock_sock(new_mptcp_sock); + + __MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEPASSIVEACK); local_bh_enable(); + } else { + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK); } return newsk; @@ -932,6 +1351,8 @@ static void mptcp_destroy(struct sock *sk) if (msk->cached_ext) __skb_ext_put(msk->cached_ext); + + sk_sockets_allocated_dec(sk); } static int mptcp_setsockopt(struct sock *sk, int level, int optname, @@ -984,7 +1405,8 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, return -EOPNOTSUPP; } -#define MPTCP_DEFERRED_ALL TCPF_DELACK_TIMER_DEFERRED +#define MPTCP_DEFERRED_ALL (TCPF_DELACK_TIMER_DEFERRED | \ + TCPF_WRITE_TIMER_DEFERRED) /* this is very alike tcp_release_cb() but we must handle differently a * different set of events @@ -1000,6 +1422,8 @@ static void mptcp_release_cb(struct sock 
*sk) nflags = flags & ~MPTCP_DEFERRED_ALL; } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags); + sock_release_ownership(sk); + if (flags & TCPF_DELACK_TIMER_DEFERRED) { struct mptcp_sock *msk = mptcp_sk(sk); struct sock *ssk; @@ -1008,6 +1432,11 @@ static void mptcp_release_cb(struct sock *sk) if (!ssk || !schedule_work(&msk->work)) __sock_put(sk); } + + if (flags & TCPF_WRITE_TIMER_DEFERRED) { + mptcp_retransmit_handler(sk); + __sock_put(sk); + } } static int mptcp_get_port(struct sock *sk, unsigned short snum) @@ -1031,13 +1460,15 @@ void mptcp_finish_connect(struct sock *ssk) u64 ack_seq; subflow = mptcp_subflow_ctx(ssk); - - if (!subflow->mp_capable) - return; - sk = subflow->conn; msk = mptcp_sk(sk); + if (!subflow->mp_capable) { + MPTCP_INC_STATS(sock_net(sk), + MPTCP_MIB_MPCAPABLEACTIVEFALLBACK); + return; + } + pr_debug("msk=%p, token=%u", sk, subflow->token); mptcp_crypto_key_sha(subflow->remote_key, NULL, &ack_seq); @@ -1055,6 +1486,9 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->ack_seq, ack_seq); WRITE_ONCE(msk->can_ack, 1); + atomic64_set(&msk->snd_una, msk->write_seq); + + mptcp_pm_new_connection(msk, 0); } static void mptcp_sock_graft(struct sock *sk, struct socket *parent) @@ -1066,6 +1500,46 @@ static void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } +bool mptcp_finish_join(struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + struct mptcp_sock *msk = mptcp_sk(subflow->conn); + struct sock *parent = (void *)msk; + struct socket *parent_sock; + bool ret; + + pr_debug("msk=%p, subflow=%p", msk, subflow); + + /* mptcp socket already closing? 
*/ + if (inet_sk_state_load(parent) != TCP_ESTABLISHED) + return false; + + if (!msk->pm.server_side) + return true; + + /* passive connection, attach to msk socket */ + parent_sock = READ_ONCE(parent->sk_socket); + if (parent_sock && !sk->sk_socket) + mptcp_sock_graft(sk, parent_sock); + + ret = mptcp_pm_allow_new_subflow(msk); + if (ret) { + /* active connections are already on conn_list */ + spin_lock_bh(&msk->join_list_lock); + if (!WARN_ON_ONCE(!list_empty(&subflow->node))) + list_add_tail(&subflow->node, &msk->join_list); + spin_unlock_bh(&msk->join_list_lock); + } + return ret; +} + +bool mptcp_sk_is_subflow(const struct sock *sk) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); + + return subflow->mp_join == 1; +} + static bool mptcp_memory_free(const struct sock *sk, int wake) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1077,6 +1551,7 @@ static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, + .disconnect = mptcp_disconnect, .close = mptcp_close, .accept = mptcp_accept, .setsockopt = mptcp_setsockopt, @@ -1089,7 +1564,12 @@ static struct proto mptcp_prot = { .hash = inet_hash, .unhash = inet_unhash, .get_port = mptcp_get_port, + .sockets_allocated = &mptcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, .stream_memory_free = mptcp_memory_free, + .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), .no_autobind = true, }; @@ -1245,6 +1725,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, /* set ssk->sk_socket of accept()ed flows to mptcp socket. * This is needed so NOSPACE flag can be set from tcp stack. 
*/ + __mptcp_flush_join_list(msk); list_for_each_entry(subflow, &msk->conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); @@ -1326,6 +1807,7 @@ static int mptcp_shutdown(struct socket *sock, int how) sock->state = SS_CONNECTED; } + __mptcp_flush_join_list(msk); mptcp_for_each_subflow(msk, subflow) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); @@ -1376,7 +1858,11 @@ void mptcp_proto_init(void) { mptcp_prot.h.hashinfo = tcp_prot.h.hashinfo; + if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL)) + panic("Failed to allocate MPTCP pcpu counter\n"); + mptcp_subflow_init(); + mptcp_pm_init(); if (proto_register(&mptcp_prot, 1) != 0) panic("Failed to register MPTCP proto.\n"); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index eb3f65264a40..f733c5425552 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -17,6 +17,12 @@ #define OPTION_MPTCP_MPC_SYN BIT(0) #define OPTION_MPTCP_MPC_SYNACK BIT(1) #define OPTION_MPTCP_MPC_ACK BIT(2) +#define OPTION_MPTCP_MPJ_SYN BIT(3) +#define OPTION_MPTCP_MPJ_SYNACK BIT(4) +#define OPTION_MPTCP_MPJ_ACK BIT(5) +#define OPTION_MPTCP_ADD_ADDR BIT(6) +#define OPTION_MPTCP_ADD_ADDR6 BIT(7) +#define OPTION_MPTCP_RM_ADDR BIT(8) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -33,12 +39,30 @@ #define TCPOLEN_MPTCP_MPC_SYNACK 12 #define TCPOLEN_MPTCP_MPC_ACK 20 #define TCPOLEN_MPTCP_MPC_ACK_DATA 22 +#define TCPOLEN_MPTCP_MPJ_SYN 12 +#define TCPOLEN_MPTCP_MPJ_SYNACK 16 +#define TCPOLEN_MPTCP_MPJ_ACK 24 #define TCPOLEN_MPTCP_DSS_BASE 4 #define TCPOLEN_MPTCP_DSS_ACK32 4 #define TCPOLEN_MPTCP_DSS_ACK64 8 #define TCPOLEN_MPTCP_DSS_MAP32 10 #define TCPOLEN_MPTCP_DSS_MAP64 14 #define TCPOLEN_MPTCP_DSS_CHECKSUM 2 +#define TCPOLEN_MPTCP_ADD_ADDR 16 +#define TCPOLEN_MPTCP_ADD_ADDR_PORT 18 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE 8 +#define TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT 10 +#define TCPOLEN_MPTCP_ADD_ADDR6 28 +#define TCPOLEN_MPTCP_ADD_ADDR6_PORT 30 +#define 
TCPOLEN_MPTCP_ADD_ADDR6_BASE 20 +#define TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT 22 +#define TCPOLEN_MPTCP_PORT_LEN 2 +#define TCPOLEN_MPTCP_RM_ADDR_BASE 4 + +/* MPTCP MP_JOIN flags */ +#define MPTCPOPT_BACKUP BIT(0) +#define MPTCPOPT_HMAC_LEN 20 +#define MPTCPOPT_THMAC_LEN 8 /* MPTCP MP_CAPABLE flags */ #define MPTCP_VERSION_MASK (0x0F) @@ -55,9 +79,75 @@ #define MPTCP_DSS_HAS_ACK BIT(0) #define MPTCP_DSS_FLAG_MASK (0x1F) +/* MPTCP ADD_ADDR flags */ +#define MPTCP_ADDR_ECHO BIT(0) +#define MPTCP_ADDR_HMAC_LEN 20 +#define MPTCP_ADDR_IPVERSION_4 4 +#define MPTCP_ADDR_IPVERSION_6 6 + /* MPTCP socket flags */ #define MPTCP_DATA_READY 0 #define MPTCP_SEND_SPACE 1 +#define MPTCP_WORK_RTX 2 + +static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field) +{ + return htonl((TCPOPT_MPTCP << 24) | (len << 16) | (subopt << 12) | + ((nib & 0xF) << 8) | field); +} + +#define MPTCP_PM_MAX_ADDR 4 + +struct mptcp_addr_info { + sa_family_t family; + __be16 port; + u8 id; + union { + struct in_addr addr; +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + struct in6_addr addr6; +#endif + }; +}; + +enum mptcp_pm_status { + MPTCP_PM_ADD_ADDR_RECEIVED, + MPTCP_PM_ESTABLISHED, + MPTCP_PM_SUBFLOW_ESTABLISHED, +}; + +struct mptcp_pm_data { + struct mptcp_addr_info local; + struct mptcp_addr_info remote; + + spinlock_t lock; /*protects the whole PM data */ + + bool addr_signal; + bool server_side; + bool work_pending; + bool accept_addr; + bool accept_subflow; + u8 add_addr_signaled; + u8 add_addr_accepted; + u8 local_addr_used; + u8 subflows; + u8 add_addr_signal_max; + u8 add_addr_accept_max; + u8 local_addr_max; + u8 subflows_max; + u8 status; + + struct work_struct work; +}; + +struct mptcp_data_frag { + struct list_head list; + u64 data_seq; + int data_len; + int offset; + int overhead; + struct page *page; +}; /* MPTCP connection sock */ struct mptcp_sock { @@ -67,14 +157,20 @@ struct mptcp_sock { u64 remote_key; u64 write_seq; u64 ack_seq; + atomic64_t snd_una; + unsigned long timer_ival; u32 
token; unsigned long flags; bool can_ack; + spinlock_t join_list_lock; struct work_struct work; struct list_head conn_list; + struct list_head rtx_queue; + struct list_head join_list; struct skb_ext *cached_ext; /* for the next sendmsg */ struct socket *subflow; /* outgoing connect/listener/!mp_capable */ struct sock *first; + struct mptcp_pm_data pm; }; #define mptcp_for_each_subflow(__msk, __subflow) \ @@ -85,17 +181,42 @@ static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) return (struct mptcp_sock *)sk; } +static inline struct mptcp_data_frag *mptcp_rtx_tail(const struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (list_empty(&msk->rtx_queue)) + return NULL; + + return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); +} + +static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + if (list_empty(&msk->rtx_queue)) + return NULL; + + return list_first_entry(&msk->rtx_queue, struct mptcp_data_frag, list); +} + struct mptcp_subflow_request_sock { struct tcp_request_sock sk; u16 mp_capable : 1, mp_join : 1, backup : 1, remote_key_valid : 1; + u8 local_id; + u8 remote_id; u64 local_key; u64 remote_key; u64 idsn; u32 token; u32 ssn_offset; + u64 thmac; + u32 local_nonce; + u32 remote_nonce; }; static inline struct mptcp_subflow_request_sock * @@ -118,16 +239,28 @@ struct mptcp_subflow_context { u32 ssn_offset; u32 map_data_len; u32 request_mptcp : 1, /* send MP_CAPABLE */ + request_join : 1, /* send MP_JOIN */ + request_bkup : 1, mp_capable : 1, /* remote is MPTCP capable */ + mp_join : 1, /* remote is JOINing */ fully_established : 1, /* path validated */ + pm_notified : 1, /* PM hook called for established status */ conn_finished : 1, map_valid : 1, mpc_map : 1, + backup : 1, data_avail : 1, rx_eof : 1, data_fin_tx_enable : 1, can_ack : 1; /* only after processing the remote a key */ u64 data_fin_tx_seq; + u32 remote_nonce; + u64 thmac; + u32 local_nonce; + 
u32 remote_token; + u8 hmac[MPTCPOPT_HMAC_LEN]; + u8 local_id; + u8 remote_id; struct sock *tcp_sock; /* tcp sk backpointer */ struct sock *conn; /* parent mptcp_sock */ @@ -171,6 +304,11 @@ mptcp_subflow_get_mapped_dsn(const struct mptcp_subflow_context *subflow) int mptcp_is_enabled(struct net *net); bool mptcp_subflow_data_available(struct sock *sk); void mptcp_subflow_init(void); + +/* called with sk socket lock held */ +int __mptcp_subflow_connect(struct sock *sk, int ifindex, + const struct mptcp_addr_info *loc, + const struct mptcp_addr_info *remote); int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock); static inline void mptcp_subflow_tcp_fallback(struct sock *sk, @@ -199,11 +337,14 @@ void mptcp_get_options(const struct sk_buff *skb, void mptcp_finish_connect(struct sock *sk); void mptcp_data_ready(struct sock *sk, struct sock *ssk); +bool mptcp_finish_join(struct sock *sk); +void mptcp_data_acked(struct sock *sk); int mptcp_token_new_request(struct request_sock *req); void mptcp_token_destroy_request(u32 token); int mptcp_token_new_connect(struct sock *sk); int mptcp_token_new_accept(u32 token, struct sock *conn); +struct mptcp_sock *mptcp_token_get_sock(u32 token); void mptcp_token_destroy(u32 token); void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn); @@ -219,8 +360,48 @@ static inline void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) mptcp_crypto_key_sha(*key, token, idsn); } -void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u32 nonce1, u32 nonce2, - void *hash_out); +void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac); + +void mptcp_pm_init(void); +void mptcp_pm_data_init(struct mptcp_sock *msk); +void mptcp_pm_close(struct mptcp_sock *msk); +void mptcp_pm_new_connection(struct mptcp_sock *msk, int server_side); +void mptcp_pm_fully_established(struct mptcp_sock *msk); +bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); +void mptcp_pm_connection_closed(struct mptcp_sock *msk); 
+void mptcp_pm_subflow_established(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow); +void mptcp_pm_subflow_closed(struct mptcp_sock *msk, u8 id); +void mptcp_pm_add_addr_received(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr); + +int mptcp_pm_announce_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr); +int mptcp_pm_remove_addr(struct mptcp_sock *msk, u8 local_id); +int mptcp_pm_remove_subflow(struct mptcp_sock *msk, u8 remote_id); + +static inline bool mptcp_pm_should_signal(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.addr_signal); +} + +static inline unsigned int mptcp_add_addr_len(int family) +{ + if (family == AF_INET) + return TCPOLEN_MPTCP_ADD_ADDR; + return TCPOLEN_MPTCP_ADD_ADDR6; +} + +bool mptcp_pm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, + struct mptcp_addr_info *saddr); +int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); + +void mptcp_pm_nl_init(void); +void mptcp_pm_nl_data_init(struct mptcp_sock *msk); +void mptcp_pm_nl_fully_established(struct mptcp_sock *msk); +void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk); +void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk); +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); static inline struct mptcp_ext *mptcp_get_ext(struct sk_buff *skb) { @@ -234,4 +415,6 @@ static inline bool before64(__u64 seq1, __u64 seq2) #define after64(seq2, seq1) before64(seq1, seq2) +void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops); + #endif /* __MPTCP_PROTOCOL_H */ diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 5bae12da2769..b5180c81588e 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -19,17 +20,42 @@ #endif #include #include "protocol.h" +#include "mib.h" + +static void SUBFLOW_REQ_INC_STATS(struct request_sock *req, + enum linux_mptcp_mib_field field) +{ + 
MPTCP_INC_STATS(sock_net(req_to_sk(req)), field); +} static int subflow_rebuild_header(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - int err = 0; + int local_id, err = 0; if (subflow->request_mptcp && !subflow->token) { pr_debug("subflow=%p", sk); err = mptcp_token_new_connect(sk); + } else if (subflow->request_join && !subflow->local_nonce) { + struct mptcp_sock *msk = (struct mptcp_sock *)subflow->conn; + + pr_debug("subflow=%p", sk); + + do { + get_random_bytes(&subflow->local_nonce, sizeof(u32)); + } while (!subflow->local_nonce); + + if (subflow->local_id) + goto out; + + local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)sk); + if (local_id < 0) + return -EINVAL; + + subflow->local_id = local_id; } +out: if (err) return err; @@ -47,6 +73,51 @@ static void subflow_req_destructor(struct request_sock *req) tcp_request_sock_ops.destructor(req); } +static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2, + void *hmac) +{ + u8 msg[8]; + + put_unaligned_be32(nonce1, &msg[0]); + put_unaligned_be32(nonce2, &msg[4]); + + mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac); +} + +/* validate received token and create truncated hmac and nonce for SYN-ACK */ +static bool subflow_token_join_request(struct request_sock *req, + const struct sk_buff *skb) +{ + struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); + u8 hmac[MPTCPOPT_HMAC_LEN]; + struct mptcp_sock *msk; + int local_id; + + msk = mptcp_token_get_sock(subflow_req->token); + if (!msk) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN); + return false; + } + + local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req); + if (local_id < 0) { + sock_put((struct sock *)msk); + return false; + } + subflow_req->local_id = local_id; + + get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); + + subflow_generate_hmac(msk->local_key, msk->remote_key, + subflow_req->local_nonce, + subflow_req->remote_nonce, hmac); + + 
subflow_req->thmac = get_unaligned_be64(hmac); + + sock_put((struct sock *)msk); + return true; +} + static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) @@ -61,6 +132,7 @@ static void subflow_init_req(struct request_sock *req, mptcp_get_options(skb, &rx_opt); subflow_req->mp_capable = 0; + subflow_req->mp_join = 0; subflow_req->remote_key_valid = 0; #ifdef CONFIG_TCP_MD5SIG @@ -71,6 +143,15 @@ static void subflow_init_req(struct request_sock *req, return; #endif + if (rx_opt.mptcp.mp_capable) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE); + + if (rx_opt.mptcp.mp_join) + return; + } else if (rx_opt.mptcp.mp_join) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX); + } + if (rx_opt.mptcp.mp_capable && listener->request_mptcp) { int err; @@ -79,6 +160,19 @@ static void subflow_init_req(struct request_sock *req, subflow_req->mp_capable = 1; subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; + } else if (rx_opt.mptcp.mp_join && listener->request_mptcp) { + subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq; + subflow_req->mp_join = 1; + subflow_req->backup = rx_opt.mptcp.backup; + subflow_req->remote_id = rx_opt.mptcp.join_id; + subflow_req->token = rx_opt.mptcp.token; + subflow_req->remote_nonce = rx_opt.mptcp.nonce; + pr_debug("token=%u, remote_nonce=%u", subflow_req->token, + subflow_req->remote_nonce); + if (!subflow_token_join_request(req, skb)) { + subflow_req->mp_join = 0; + // @@ need to trigger RST + } } } @@ -106,6 +200,25 @@ static void subflow_v6_init_req(struct request_sock *req, } #endif +/* validate received truncated hmac and create hmac for third ACK */ +static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) +{ + u8 hmac[MPTCPOPT_HMAC_LEN]; + u64 thmac; + + subflow_generate_hmac(subflow->remote_key, subflow->local_key, + subflow->remote_nonce, subflow->local_nonce, + hmac); + + thmac = get_unaligned_be64(hmac); + pr_debug("subflow=%p, token=%u, thmac=%llu, 
subflow->thmac=%llu\n", + subflow, subflow->token, + (unsigned long long)thmac, + (unsigned long long)subflow->thmac); + + return thmac == subflow->thmac; +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -118,7 +231,10 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) parent->sk_state_change(parent); } - if (!subflow->conn_finished) { + if (subflow->conn_finished || !tcp_sk(sk)->is_mptcp) + return; + + if (subflow->mp_capable) { pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk), subflow->remote_key); mptcp_finish_connect(sk); @@ -128,6 +244,33 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq); subflow->ssn_offset = TCP_SKB_CB(skb)->seq; } + } else if (subflow->mp_join) { + pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", + subflow, subflow->thmac, + subflow->remote_nonce); + if (!subflow_thmac_valid(subflow)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC); + subflow->mp_join = 0; + goto do_reset; + } + + subflow_generate_hmac(subflow->local_key, subflow->remote_key, + subflow->local_nonce, + subflow->remote_nonce, + subflow->hmac); + + if (skb) + subflow->ssn_offset = TCP_SKB_CB(skb)->seq; + + if (!mptcp_finish_join(sk)) + goto do_reset; + + subflow->conn_finished = 1; + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX); + } else { +do_reset: + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); } } @@ -178,6 +321,32 @@ drop: } #endif +/* validate hmac received in third ACK */ +static bool subflow_hmac_valid(const struct request_sock *req, + const struct tcp_options_received *rx_opt) +{ + const struct mptcp_subflow_request_sock *subflow_req; + u8 hmac[MPTCPOPT_HMAC_LEN]; + struct mptcp_sock *msk; + bool ret; + + subflow_req = mptcp_subflow_rsk(req); + msk = mptcp_token_get_sock(subflow_req->token); + if (!msk) + return 
false; + + subflow_generate_hmac(msk->remote_key, msk->local_key, + subflow_req->remote_nonce, + subflow_req->local_nonce, hmac); + + ret = true; + if (crypto_memneq(hmac, rx_opt->mptcp.hmac, sizeof(hmac))) + ret = false; + + sock_put((struct sock *)msk); + return ret; +} + static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -188,6 +357,7 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk); struct mptcp_subflow_request_sock *subflow_req; struct tcp_options_received opt_rx; + bool fallback_is_fatal = false; struct sock *new_msk = NULL; struct sock *child; @@ -221,6 +391,15 @@ create_msk: new_msk = mptcp_sk_clone(listener->conn, req); if (!new_msk) subflow_req->mp_capable = 0; + } else if (subflow_req->mp_join) { + fallback_is_fatal = true; + opt_rx.mptcp.mp_join = 0; + mptcp_get_options(skb, &opt_rx); + if (!opt_rx.mptcp.mp_join || + !subflow_hmac_valid(req, &opt_rx)) { + SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC); + return NULL; + } } create_child: @@ -230,20 +409,35 @@ create_child: if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); - /* we have null ctx on TCP fallback, not fatal on MPC - * handshake + /* we have null ctx on TCP fallback, which is fatal on + * MPJ handshake */ - if (!ctx) + if (!ctx) { + if (fallback_is_fatal) + goto close_child; goto out; + } if (ctx->mp_capable) { /* new mpc subflow takes ownership of the newly * created mptcp socket */ - inet_sk_state_store((struct sock *)new_msk, - TCP_ESTABLISHED); + inet_sk_state_store(new_msk, TCP_ESTABLISHED); + mptcp_pm_new_connection(mptcp_sk(new_msk), 1); ctx->conn = new_msk; new_msk = NULL; + } else if (ctx->mp_join) { + struct mptcp_sock *owner; + + owner = mptcp_token_get_sock(ctx->token); + if (!owner) + goto close_child; + + ctx->conn = (struct sock *)owner; + if (!mptcp_finish_join(child)) + goto close_child; + + 
SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); } } @@ -252,6 +446,12 @@ out: if (unlikely(new_msk)) sock_put(new_msk); return child; + +close_child: + tcp_send_active_reset(child, GFP_ATOMIC); + inet_csk_prepare_forced_close(child); + tcp_done(child); + return NULL; } static struct inet_connection_sock_af_ops subflow_specific; @@ -353,6 +553,7 @@ static enum mapping_status get_mapping_status(struct sock *ssk) data_len = mpext->data_len; if (data_len == 0) { pr_err("Infinite mapping not handled"); + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX); return MAPPING_INVALID; } @@ -396,8 +597,10 @@ static enum mapping_status get_mapping_status(struct sock *ssk) /* If this skb data are fully covered by the current mapping, * the new map would need caching, which is not supported */ - if (skb_is_fully_mapped(ssk, skb)) + if (skb_is_fully_mapped(ssk, skb)) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH); return MAPPING_INVALID; + } /* will validate the next map after consuming the current one */ return MAPPING_OK; @@ -566,7 +769,7 @@ static void subflow_data_ready(struct sock *sk) struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); struct sock *parent = subflow->conn; - if (!subflow->mp_capable) { + if (!subflow->mp_capable && !subflow->mp_join) { subflow->tcp_data_ready(sk); parent->sk_data_ready(parent); @@ -621,6 +824,85 @@ void mptcpv6_handle_mapped(struct sock *sk, bool mapped) } #endif +static void mptcp_info2sockaddr(const struct mptcp_addr_info *info, + struct sockaddr_storage *addr) +{ + memset(addr, 0, sizeof(*addr)); + addr->ss_family = info->family; + if (addr->ss_family == AF_INET) { + struct sockaddr_in *in_addr = (struct sockaddr_in *)addr; + + in_addr->sin_addr = info->addr; + in_addr->sin_port = info->port; + } +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + else if (addr->ss_family == AF_INET6) { + struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr; + + in6_addr->sin6_addr = info->addr6; + in6_addr->sin6_port = info->port; + } 
+#endif +} + +int __mptcp_subflow_connect(struct sock *sk, int ifindex, + const struct mptcp_addr_info *loc, + const struct mptcp_addr_info *remote) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct mptcp_subflow_context *subflow; + struct sockaddr_storage addr; + struct socket *sf; + u32 remote_token; + int addrlen; + int err; + + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + err = mptcp_subflow_create_socket(sk, &sf); + if (err) + return err; + + subflow = mptcp_subflow_ctx(sf->sk); + subflow->remote_key = msk->remote_key; + subflow->local_key = msk->local_key; + subflow->token = msk->token; + mptcp_info2sockaddr(loc, &addr); + + addrlen = sizeof(struct sockaddr_in); +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + if (loc->family == AF_INET6) + addrlen = sizeof(struct sockaddr_in6); +#endif + sf->sk->sk_bound_dev_if = ifindex; + err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); + if (err) + goto failed; + + mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL); + pr_debug("msk=%p remote_token=%u", msk, remote_token); + subflow->remote_token = remote_token; + subflow->local_id = loc->id; + subflow->request_join = 1; + subflow->request_bkup = 1; + mptcp_info2sockaddr(remote, &addr); + + err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); + if (err && err != -EINPROGRESS) + goto failed; + + spin_lock_bh(&msk->join_list_lock); + list_add_tail(&subflow->node, &msk->join_list); + spin_unlock_bh(&msk->join_list_lock); + + return err; + +failed: + sock_release(sf); + return err; +} + int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) { struct mptcp_subflow_context *subflow; @@ -785,7 +1067,8 @@ static void subflow_ulp_clone(const struct request_sock *req, struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); struct mptcp_subflow_context *new_ctx; - if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) { + if (!tcp_rsk(req)->is_mptcp || + (!subflow_req->mp_capable && 
!subflow_req->mp_join)) { subflow_ulp_fallback(newsk, old_ctx); return; } @@ -796,9 +1079,6 @@ static void subflow_ulp_clone(const struct request_sock *req, return; } - /* see comments in subflow_syn_recv_sock(), MPTCP connection is fully - * established only after we receive the remote key - */ new_ctx->conn_finished = 1; new_ctx->icsk_af_ops = old_ctx->icsk_af_ops; new_ctx->tcp_data_ready = old_ctx->tcp_data_ready; @@ -807,14 +1087,27 @@ static void subflow_ulp_clone(const struct request_sock *req, new_ctx->rel_write_seq = 1; new_ctx->tcp_sock = newsk; - new_ctx->mp_capable = 1; - new_ctx->fully_established = subflow_req->remote_key_valid; - new_ctx->can_ack = subflow_req->remote_key_valid; - new_ctx->remote_key = subflow_req->remote_key; - new_ctx->local_key = subflow_req->local_key; - new_ctx->token = subflow_req->token; - new_ctx->ssn_offset = subflow_req->ssn_offset; - new_ctx->idsn = subflow_req->idsn; + if (subflow_req->mp_capable) { + /* see comments in subflow_syn_recv_sock(), MPTCP connection + * is fully established only after we receive the remote key + */ + new_ctx->mp_capable = 1; + new_ctx->fully_established = subflow_req->remote_key_valid; + new_ctx->can_ack = subflow_req->remote_key_valid; + new_ctx->remote_key = subflow_req->remote_key; + new_ctx->local_key = subflow_req->local_key; + new_ctx->token = subflow_req->token; + new_ctx->ssn_offset = subflow_req->ssn_offset; + new_ctx->idsn = subflow_req->idsn; + } else if (subflow_req->mp_join) { + new_ctx->ssn_offset = subflow_req->ssn_offset; + new_ctx->mp_join = 1; + new_ctx->fully_established = 1; + new_ctx->backup = subflow_req->backup; + new_ctx->local_id = subflow_req->local_id; + new_ctx->token = subflow_req->token; + new_ctx->thmac = subflow_req->thmac; + } } static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = { @@ -876,6 +1169,8 @@ void mptcp_subflow_init(void) subflow_v6m_specific.net_frag_header_len = 0; #endif + mptcp_diag_subflow_init(&subflow_ulp_ops); + if 
(tcp_register_ulp(&subflow_ulp_ops) != 0) panic("MPTCP: failed to register subflows to ULP\n"); } diff --git a/net/mptcp/token.c b/net/mptcp/token.c index b71b53c0ac8d..129a5ad1bc35 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -140,6 +140,33 @@ int mptcp_token_new_accept(u32 token, struct sock *conn) return err; } +/** + * mptcp_token_get_sock - retrieve mptcp connection sock using its token + * @token: token of the mptcp connection to retrieve + * + * This function returns the mptcp connection structure with the given token. + * A reference count on the mptcp socket returned is taken. + * + * returns NULL if no connection with the given token value exists. + */ +struct mptcp_sock *mptcp_token_get_sock(u32 token) +{ + struct sock *conn; + + spin_lock_bh(&token_tree_lock); + conn = radix_tree_lookup(&token_tree, token); + if (conn) { + /* token still reserved? */ + if (conn == (struct sock *)&token_used) + conn = NULL; + else + sock_hold(conn); + } + spin_unlock_bh(&token_tree_lock); + + return mptcp_sk(conn); +} + /** * mptcp_token_destroy_request - remove mptcp connection/token * @token - token of mptcp connection to remove diff --git a/tools/testing/selftests/net/mptcp/.gitignore b/tools/testing/selftests/net/mptcp/.gitignore index d72f07642738..ea13b255a99d 100644 --- a/tools/testing/selftests/net/mptcp/.gitignore +++ b/tools/testing/selftests/net/mptcp/.gitignore @@ -1,2 +1,3 @@ mptcp_connect +pm_nl_ctl *.pcap diff --git a/tools/testing/selftests/net/mptcp/Makefile b/tools/testing/selftests/net/mptcp/Makefile index ba450e62dc5b..f50976ee7d44 100644 --- a/tools/testing/selftests/net/mptcp/Makefile +++ b/tools/testing/selftests/net/mptcp/Makefile @@ -1,12 +1,13 @@ # SPDX-License-Identifier: GPL-2.0 top_srcdir = ../../../../.. 
+KSFT_KHDR_INSTALL := 1 -CFLAGS = -Wall -Wl,--no-as-needed -O2 -g +CFLAGS = -Wall -Wl,--no-as-needed -O2 -g -I$(top_srcdir)/usr/include -TEST_PROGS := mptcp_connect.sh +TEST_PROGS := mptcp_connect.sh pm_netlink.sh mptcp_join.sh -TEST_GEN_FILES = mptcp_connect +TEST_GEN_FILES = mptcp_connect pm_nl_ctl TEST_FILES := settings diff --git a/tools/testing/selftests/net/mptcp/mptcp_connect.c b/tools/testing/selftests/net/mptcp/mptcp_connect.c index 702bab2c12da..cedee5b952ba 100644 --- a/tools/testing/selftests/net/mptcp/mptcp_connect.c +++ b/tools/testing/selftests/net/mptcp/mptcp_connect.c @@ -51,6 +51,7 @@ static bool tcpulp_audit; static int pf = AF_INET; static int cfg_sndbuf; static int cfg_rcvbuf; +static bool cfg_join; static void die_usage(void) { @@ -250,6 +251,7 @@ static int sock_connect_mptcp(const char * const remoteaddr, static size_t do_rnd_write(const int fd, char *buf, const size_t len) { + static bool first = true; unsigned int do_w; ssize_t bw; @@ -257,10 +259,19 @@ static size_t do_rnd_write(const int fd, char *buf, const size_t len) if (do_w == 0 || do_w > len) do_w = len; + if (cfg_join && first && do_w > 100) + do_w = 100; + bw = write(fd, buf, do_w); if (bw < 0) perror("write"); + /* let the join handshake complete, before going on */ + if (cfg_join && first) { + usleep(200000); + first = false; + } + return bw; } @@ -385,8 +396,11 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd) break; /* ... but we still receive. - * Close our write side. + * Close our write side, ev. 
give some time + * for address notification */ + if (cfg_join) + usleep(400000); shutdown(peerfd, SHUT_WR); } else { if (errno == EINTR) @@ -403,6 +417,10 @@ static int copyfd_io_poll(int infd, int peerfd, int outfd) } } + /* leave some time for late join/announce */ + if (cfg_join) + usleep(400000); + close(peerfd); return 0; } @@ -658,7 +676,7 @@ static void maybe_close(int fd) { unsigned int r = rand(); - if (r & 1) + if (!cfg_join && (r & 1)) close(fd); } @@ -794,8 +812,12 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "6lp:s:hut:m:S:R:")) != -1) { + while ((c = getopt(argc, argv, "6jlp:s:hut:m:S:R:")) != -1) { switch (c) { + case 'j': + cfg_join = true; + cfg_mode = CFG_MODE_POLL; + break; case 'l': listen_mode = true; break; diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh new file mode 100755 index 000000000000..dd42c2f692d0 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh @@ -0,0 +1,357 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ret=0 +sin="" +sout="" +cin="" +cout="" +ksft_skip=4 +timeout=30 +capture=0 + +TEST_COUNT=0 + +init() +{ + capout=$(mktemp) + + rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) + + ns1="ns1-$rndh" + ns2="ns2-$rndh" + + for netns in "$ns1" "$ns2";do + ip netns add $netns || exit $ksft_skip + ip -net $netns link set lo up + ip netns exec $netns sysctl -q net.mptcp.enabled=1 + ip netns exec $netns sysctl -q net.ipv4.conf.all.rp_filter=0 + ip netns exec $netns sysctl -q net.ipv4.conf.default.rp_filter=0 + done + + # ns1 ns2 + # ns1eth1 ns2eth1 + # ns1eth2 ns2eth2 + # ns1eth3 ns2eth3 + # ns1eth4 ns2eth4 + + for i in `seq 1 4`; do + ip link add ns1eth$i netns "$ns1" type veth peer name ns2eth$i netns "$ns2" + ip -net "$ns1" addr add 10.0.$i.1/24 dev ns1eth$i + ip -net "$ns1" addr add dead:beef:$i::1/64 dev ns1eth$i nodad + ip -net "$ns1" link set ns1eth$i up + + ip -net "$ns2" addr add 10.0.$i.2/24 dev 
ns2eth$i + ip -net "$ns2" addr add dead:beef:$i::2/64 dev ns2eth$i nodad + ip -net "$ns2" link set ns2eth$i up + + # let $ns2 reach any $ns1 address from any interface + ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i + done +} + +cleanup_partial() +{ + rm -f "$capout" + + for netns in "$ns1" "$ns2"; do + ip netns del $netns + done +} + +cleanup() +{ + rm -f "$cin" "$cout" + rm -f "$sin" "$sout" + cleanup_partial +} + +reset() +{ + cleanup_partial + init +} + +for arg in "$@"; do + if [ "$arg" = "-c" ]; then + capture=1 + fi +done + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + + +check_transfer() +{ + in=$1 + out=$2 + what=$3 + + cmp "$in" "$out" > /dev/null 2>&1 + if [ $? -ne 0 ] ;then + echo "[ FAIL ] $what does not match (in, out):" + print_file_err "$in" + print_file_err "$out" + + return 1 + fi + + return 0 +} + +do_ping() +{ + listener_ns="$1" + connector_ns="$2" + connect_addr="$3" + + ip netns exec ${connector_ns} ping -q -c 1 $connect_addr >/dev/null + if [ $? -ne 0 ] ; then + echo "$listener_ns -> $connect_addr connectivity [ FAIL ]" 1>&2 + ret=1 + fi +} + +do_transfer() +{ + listener_ns="$1" + connector_ns="$2" + cl_proto="$3" + srv_proto="$4" + connect_addr="$5" + + port=$((10000+$TEST_COUNT)) + TEST_COUNT=$((TEST_COUNT+1)) + + :> "$cout" + :> "$sout" + :> "$capout" + + if [ $capture -eq 1 ]; then + if [ -z $SUDO_USER ] ; then + capuser="" + else + capuser="-Z $SUDO_USER" + fi + + capfile="mp_join-${listener_ns}.pcap" + + echo "Capturing traffic for test $TEST_COUNT into $capfile" + ip netns exec ${listener_ns} tcpdump -i any -s 65535 -B 32768 $capuser -w $capfile > "$capout" 2>&1 & + cappid=$! + + sleep 1 + fi + + ip netns exec ${listener_ns} ./mptcp_connect -j -t $timeout -l -p $port -s ${srv_proto} 0.0.0.0 < "$sin" > "$sout" & + spid=$! 
+ + sleep 1 + + ip netns exec ${connector_ns} ./mptcp_connect -j -t $timeout -p $port -s ${cl_proto} $connect_addr < "$cin" > "$cout" & + cpid=$! + + wait $cpid + retc=$? + wait $spid + rets=$? + + if [ $capture -eq 1 ]; then + sleep 1 + kill $cappid + fi + + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " client exit code $retc, server $rets" 1>&2 + echo "\nnetns ${listener_ns} socket stat for $port:" 1>&2 + ip netns exec ${listener_ns} ss -nita 1>&2 -o "sport = :$port" + echo "\nnetns ${connector_ns} socket stat for $port:" 1>&2 + ip netns exec ${connector_ns} ss -nita 1>&2 -o "dport = :$port" + + cat "$capout" + return 1 + fi + + check_transfer $sin $cout "file received by client" + retc=$? + check_transfer $cin $sout "file received by server" + rets=$? + + if [ $retc -eq 0 ] && [ $rets -eq 0 ];then + cat "$capout" + return 0 + fi + + cat "$capout" + return 1 +} + +make_file() +{ + name=$1 + who=$2 + + SIZE=1 + + dd if=/dev/urandom of="$name" bs=1024 count=$SIZE 2> /dev/null + echo -e "\nMPTCP_TEST_FILE_END_MARKER" >> "$name" + + echo "Created $name (size $SIZE KB) containing data sent by $who" +} + +run_tests() +{ + listener_ns="$1" + connector_ns="$2" + connect_addr="$3" + lret=0 + + do_transfer ${listener_ns} ${connector_ns} MPTCP MPTCP ${connect_addr} + lret=$? 
+ if [ $lret -ne 0 ]; then + ret=$lret + return + fi +} + +chk_join_nr() +{ + local msg="$1" + local syn_nr=$2 + local syn_ack_nr=$3 + local ack_nr=$4 + local count + local dump_stats + + printf "%-36s %s" "$msg" "syn" + count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinSynRx | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != "$syn_nr" ]; then + echo "[fail] got $count JOIN[s] syn expected $syn_nr" + ret=1 + dump_stats=1 + else + echo -n "[ ok ]" + fi + + echo -n " - synack" + count=`ip netns exec $ns2 nstat -as | grep MPTcpExtMPJoinSynAckRx | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != "$syn_ack_nr" ]; then + echo "[fail] got $count JOIN[s] synack expected $syn_ack_nr" + ret=1 + dump_stats=1 + else + echo -n "[ ok ]" + fi + + echo -n " - ack" + count=`ip netns exec $ns1 nstat -as | grep MPTcpExtMPJoinAckRx | awk '{print $2}'` + [ -z "$count" ] && count=0 + if [ "$count" != "$ack_nr" ]; then + echo "[fail] got $count JOIN[s] ack expected $ack_nr" + ret=1 + dump_stats=1 + else + echo "[ ok ]" + fi + if [ "${dump_stats}" = 1 ]; then + echo Server ns stats + ip netns exec $ns1 nstat -as | grep MPTcp + echo Client ns stats + ip netns exec $ns2 nstat -as | grep MPTcp + fi +} + +sin=$(mktemp) +sout=$(mktemp) +cin=$(mktemp) +cout=$(mktemp) +init +make_file "$cin" "client" +make_file "$sin" "server" +trap cleanup EXIT + +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "no JOIN" "0" "0" "0" + +# subflow limted by client +reset +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "single subflow, limited by client" 0 0 0 + +# subflow limted by server +reset +ip netns exec $ns2 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "single subflow, limited by server" 1 1 0 + +# subflow +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl add 
10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "single subflow" 1 1 1 + +# multiple subflows +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "multiple subflows" 2 2 2 + +# multiple subflows limited by serverf +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "multiple subflows, limited by server" 2 2 1 + +# add_address, unused +reset +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "unused signal address" 0 0 0 + +# accept and use add_addr +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 1 +ip netns exec $ns2 ./pm_nl_ctl limits 1 1 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "signal address" 1 1 1 + +# accept and use add_addr with an additional subflow +# note: signal address in server ns and local addresses in client ns must +# belong to different subnets or one of the listed local address could be +# used for 'add_addr' subflow +reset +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +ip netns exec $ns1 ./pm_nl_ctl limits 0 2 +ip netns exec $ns2 ./pm_nl_ctl limits 1 2 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +run_tests $ns1 $ns2 10.0.1.1 +chk_join_nr "subflow and signal" 2 2 2 + +# accept and use add_addr with additional subflows +reset +ip netns exec $ns1 ./pm_nl_ctl limits 0 3 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal +ip netns exec $ns2 ./pm_nl_ctl limits 1 3 +ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow +ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow +run_tests $ns1 
$ns2 10.0.1.1 +chk_join_nr "multiple subflows and signal" 3 3 3 + +exit $ret diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh new file mode 100755 index 000000000000..8c7bd722476e --- /dev/null +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -0,0 +1,130 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +ksft_skip=4 +ret=0 + +usage() { + echo "Usage: $0 [ -h ]" +} + + +while getopts "$optstring" option;do + case "$option" in + "h") + usage $0 + exit 0 + ;; + "?") + usage $0 + exit 1 + ;; + esac +done + +sec=$(date +%s) +rndh=$(printf %x $sec)-$(mktemp -u XXXXXX) +ns1="ns1-$rndh" +err=$(mktemp) +ret=0 + +cleanup() +{ + rm -f $out + ip netns del $ns1 +} + +ip -Version > /dev/null 2>&1 +if [ $? -ne 0 ];then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +trap cleanup EXIT + +ip netns add $ns1 || exit $ksft_skip +ip -net $ns1 link set lo up +ip netns exec $ns1 sysctl -q net.mptcp.enabled=1 + +check() +{ + local cmd="$1" + local expected="$2" + local msg="$3" + local out=`$cmd 2>$err` + local cmd_ret=$? 
+ + printf "%-50s %s" "$msg" + if [ $cmd_ret -ne 0 ]; then + echo "[FAIL] command execution '$cmd' stderr " + cat $err + ret=1 + elif [ "$out" = "$expected" ]; then + echo "[ OK ]" + else + echo -n "[FAIL] " + echo "expected '$expected' got '$out'" + ret=1 + fi +} + +check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "defaults addr list" +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "defaults limits" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.1 +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.2 flags subflow dev lo +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 flags signal,backup +check "ip netns exec $ns1 ./pm_nl_ctl get 1" "id 1 flags 10.0.1.1 " "simple add/get addr" + +check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +"id 1 flags 10.0.1.1 +id 2 flags subflow dev lo 10.0.1.2 +id 3 flags signal,backup 10.0.1.3 " "dump addrs" + +ip netns exec $ns1 ./pm_nl_ctl del 2 +check "ip netns exec $ns1 ./pm_nl_ctl get 2" "" "simple del addr" +check "ip netns exec $ns1 ./pm_nl_ctl dump" \ +"id 1 flags 10.0.1.1 +id 3 flags signal,backup 10.0.1.3 " "dump addrs after del" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.3 +check "ip netns exec $ns1 ./pm_nl_ctl get 4" "" "duplicate addr" + +ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.4 id 10 flags signal +check "ip netns exec $ns1 ./pm_nl_ctl get 4" "id 4 flags signal 10.0.1.4 " "id addr increment" + +for i in `seq 5 9`; do + ip netns exec $ns1 ./pm_nl_ctl add 10.0.1.$i flags signal >/dev/null 2>&1 +done +check "ip netns exec $ns1 ./pm_nl_ctl get 9" "id 9 flags signal 10.0.1.9 " "hard addr limit" +check "ip netns exec $ns1 ./pm_nl_ctl get 10" "" "above hard addr limit" + +for i in `seq 9 256`; do + ip netns exec $ns1 ./pm_nl_ctl del $i + ip netns exec $ns1 ./pm_nl_ctl add 10.0.0.9 +done +check "ip netns exec $ns1 ./pm_nl_ctl dump" "id 1 flags 10.0.1.1 +id 3 flags signal,backup 10.0.1.3 +id 4 flags signal 10.0.1.4 +id 5 flags signal 10.0.1.5 +id 6 flags signal 10.0.1.6 +id 7 flags signal 10.0.1.7 +id 8 flags signal 10.0.1.8 " 
"id limit" + +ip netns exec $ns1 ./pm_nl_ctl flush +check "ip netns exec $ns1 ./pm_nl_ctl dump" "" "flush addrs" + +ip netns exec $ns1 ./pm_nl_ctl limits 9 1 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "rcv addrs above hard limit" + +ip netns exec $ns1 ./pm_nl_ctl limits 1 9 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 0 +subflows 0" "subflows above hard limit" + +ip netns exec $ns1 ./pm_nl_ctl limits 8 8 +check "ip netns exec $ns1 ./pm_nl_ctl limits" "accept 8 +subflows 8" "set limits" + +exit $ret diff --git a/tools/testing/selftests/net/mptcp/pm_nl_ctl.c b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c new file mode 100644 index 000000000000..de9209305026 --- /dev/null +++ b/tools/testing/selftests/net/mptcp/pm_nl_ctl.c @@ -0,0 +1,616 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include "linux/mptcp.h" + +#ifndef MPTCP_PM_NAME +#define MPTCP_PM_NAME "mptcp_pm" +#endif + +static void syntax(char *argv[]) +{ + fprintf(stderr, "%s add|get|del|flush|dump|accept []\n", argv[0]); + fprintf(stderr, "\tadd [flags signal|subflow|backup] [id ] [dev ] \n"); + fprintf(stderr, "\tdel \n"); + fprintf(stderr, "\tget \n"); + fprintf(stderr, "\tflush\n"); + fprintf(stderr, "\tdump\n"); + fprintf(stderr, "\tlimits [ ]\n"); + exit(0); +} + +static int init_genl_req(char *data, int family, int cmd, int version) +{ + struct nlmsghdr *nh = (void *)data; + struct genlmsghdr *gh; + int off = 0; + + nh->nlmsg_type = family; + nh->nlmsg_flags = NLM_F_REQUEST; + nh->nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + off += NLMSG_ALIGN(sizeof(*nh)); + + gh = (void *)(data + off); + gh->cmd = cmd; + gh->version = version; + off += NLMSG_ALIGN(sizeof(*gh)); + return off; +} + +static void nl_error(struct nlmsghdr *nh) +{ + struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(nh); + int len = nh->nlmsg_len - sizeof(*nh); + uint32_t off; + + if 
(len < sizeof(struct nlmsgerr))
+		error(1, 0, "netlink error message truncated %d min %ld", len,
+		      sizeof(struct nlmsgerr));
+
+	if (!err->error) {
+		/* check messages from kernel */
+		struct rtattr *attrs = (struct rtattr *)NLMSG_DATA(nh);
+
+		while (RTA_OK(attrs, len)) {
+			if (attrs->rta_type == NLMSGERR_ATTR_MSG)
+				fprintf(stderr, "netlink ext ack msg: %s\n",
+					(char *)RTA_DATA(attrs));
+			if (attrs->rta_type == NLMSGERR_ATTR_OFFS) {
+				memcpy(&off, RTA_DATA(attrs), 4);
+				fprintf(stderr, "netlink err off %d\n",
+					(int)off);
+			}
+			attrs = RTA_NEXT(attrs, len);
+		}
+	} else {
+		fprintf(stderr, "netlink error %d", err->error);
+	}
+}
+
+/*
+ * Do a netlink command and, if max > 0, fetch the reply.
+ *
+ * @fd:  netlink socket
+ * @nh:  request buffer; it is also reused as the receive buffer
+ * @len: request length in bytes, stored into nh->nlmsg_len before sending
+ * @max: receive buffer size; 0 means "do not wait for a reply"
+ *
+ * Returns 0 when max == 0, otherwise the number of bytes received.
+ * A short send, a failed receive or any NLMSG_ERROR message in the reply
+ * is reported through error(1, ...).
+ */
+static int do_nl_req(int fd, struct nlmsghdr *nh, int len, int max)
+{
+	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+	socklen_t addr_len;
+	void *data = nh;
+	int rem, ret;
+	int err = 0;
+
+	nh->nlmsg_len = len;
+	ret = sendto(fd, data, len, 0, (void *)&nladdr, sizeof(nladdr));
+	/* both values are signed ints: a failed call must print -1 */
+	if (ret != len)
+		error(1, errno, "send netlink: %dB != %dB", ret, len);
+	if (max == 0)
+		return 0;
+
+	addr_len = sizeof(nladdr);
+	rem = ret = recvfrom(fd, data, max, 0, (void *)&nladdr, &addr_len);
+	if (ret < 0)
+		error(1, errno, "recv netlink: %dB", ret);
+
+	/* Beware: the NLMSG_NEXT macro updates the 'rem' argument */
+	for (; NLMSG_OK(nh, rem); nh = NLMSG_NEXT(nh, rem)) {
+		if (nh->nlmsg_type == NLMSG_ERROR) {
+			nl_error(nh);
+			err = 1;
+		}
+	}
+	if (err)
+		error(1, 0, "bailing out due to netlink error[s]");
+	return ret;
+}
+
+/*
+ * Extract the family id from a generic netlink controller reply.
+ *
+ * @nlh: message expected to have type GENL_ID_CTRL and to carry a
+ *       CTRL_CMD_NEWFAMILY command
+ *
+ * Returns the value of the first CTRL_ATTR_FAMILY_ID attribute found.
+ * Any malformed or unexpected message is reported through error(1, ...).
+ * No libc call fails on these paths, so 0 is passed as the error number
+ * rather than a stale errno.
+ */
+static int genl_parse_getfamily(struct nlmsghdr *nlh)
+{
+	struct genlmsghdr *ghdr = NLMSG_DATA(nlh);
+	int len = nlh->nlmsg_len;
+	struct rtattr *attrs;
+
+	if (nlh->nlmsg_type != GENL_ID_CTRL)
+		error(1, 0, "Not a controller message, len=%d type=0x%x",
+		      nlh->nlmsg_len, nlh->nlmsg_type);
+
+	/* what is left after the netlink and genetlink headers */
+	len -= NLMSG_LENGTH(GENL_HDRLEN);
+
+	if (len < 0)
+		error(1, 0, "wrong controller message len %d", len);
+
+	if (ghdr->cmd != CTRL_CMD_NEWFAMILY)
+		error(1, 0, "Unknown controller command %d", ghdr->cmd);
+
+	attrs = (struct rtattr *) ((char *) ghdr + GENL_HDRLEN);
+	while (RTA_OK(attrs, len)) {
+		if (attrs->rta_type == CTRL_ATTR_FAMILY_ID)
+			return *(__u16 *)RTA_DATA(attrs);
+		attrs = RTA_NEXT(attrs, len);
+	}
+
+	error(1, 0, "can't find CTRL_ATTR_FAMILY_ID attr");
+	return -1;
+}
+
+/*
+ * Ask the generic netlink controller for the family registered under
+ * MPTCP_PM_NAME and return its id.
+ *
+ * @fd: netlink socket
+ *
+ * The request is a CTRL_CMD_GETFAMILY command carrying a single
+ * CTRL_ATTR_FAMILY_NAME attribute (the name including its trailing NUL).
+ * The reply is received into the same buffer and handed to
+ * genl_parse_getfamily().
+ */
+static int resolve_mptcp_pm_netlink(int fd)
+{
+	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+		  1024];
+	struct nlmsghdr *nh;
+	struct rtattr *rta;
+	int namelen;
+	int off = 0;
+
+	memset(data, 0, sizeof(data));
+	nh = (void *)data;
+	/* the returned offset is where the first attribute is written */
+	off = init_genl_req(data, GENL_ID_CTRL, CTRL_CMD_GETFAMILY, 0);
+
+	rta = (void *)(data + off);
+	namelen = strlen(MPTCP_PM_NAME) + 1;
+	rta->rta_type = CTRL_ATTR_FAMILY_NAME;
+	rta->rta_len = RTA_LENGTH(namelen);
+	memcpy(RTA_DATA(rta), MPTCP_PM_NAME, namelen);
+	off += NLMSG_ALIGN(rta->rta_len);
+
+	do_nl_req(fd, nh, off, sizeof(data));
+	return genl_parse_getfamily((void *)data);
+}
+
+/*
+ * "add" sub-command: send MPTCP_PM_CMD_ADD_ADDR for the address in argv[2].
+ *
+ * @fd:        netlink socket
+ * @pm_family: netlink message type used for the request
+ * @argc:      argument count; syntax() is called when it is below 3
+ * @argv:      argv[2] is an IPv4 or IPv6 address, followed by any of the
+ *             optional keyword/value pairs:
+ *               flags <list>  comma separated list of subflow, signal, backup
+ *               id <n>        address id, parsed with atoi() into 8 bits
+ *               dev <name>    interface name, converted to its index
+ *
+ * All address attributes are placed inside one nested MPTCP_PM_ATTR_ADDR
+ * attribute. No reply is fetched. Always returns 0; invalid arguments are
+ * reported through error(1, ...).
+ */
+int add_addr(int fd, int pm_family, int argc, char *argv[])
+{
+	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+		  1024];
+	struct rtattr *rta, *nest;
+	struct nlmsghdr *nh;
+	u_int16_t family;
+	u_int32_t flags;
+	int nest_start;
+	u_int8_t id;
+	int off = 0;
+	int arg;
+
+	memset(data, 0, sizeof(data));
+	nh = (void *)data;
+	off = init_genl_req(data, pm_family, MPTCP_PM_CMD_ADD_ADDR,
+			    MPTCP_PM_VER);
+
+	if (argc < 3)
+		syntax(argv);
+
+	/* open the nest; its final length is patched in once it is complete */
+	nest_start = off;
+	nest = (void *)(data + off);
+	nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
+	nest->rta_len = RTA_LENGTH(0);
+	off += NLMSG_ALIGN(nest->rta_len);
+
+	/* addr data: inet_pton() writes straight into the attribute payload */
+	rta = (void *)(data + off);
+	if (inet_pton(AF_INET, argv[2], RTA_DATA(rta))) {
+		family = AF_INET;
+		rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR4;
+		rta->rta_len = RTA_LENGTH(4);
+	} else if (inet_pton(AF_INET6, argv[2], RTA_DATA(rta))) {
+		family = AF_INET6;
+		rta->rta_type = MPTCP_PM_ADDR_ATTR_ADDR6;
+		rta->rta_len = RTA_LENGTH(16);
+	} else
+		/* a 0 return means "not parsable": errno is not meaningful */
+		error(1, 0, "can't parse ip %s", argv[2]);
+	off += NLMSG_ALIGN(rta->rta_len);
+
+	/* family */
+	rta = (void *)(data + off);
+	rta->rta_type = MPTCP_PM_ADDR_ATTR_FAMILY;
+	rta->rta_len = RTA_LENGTH(2);
+	memcpy(RTA_DATA(rta), &family, 2);
+	off += NLMSG_ALIGN(rta->rta_len);
+
+	for (arg = 3; arg < argc; arg++) {
+		if (!strcmp(argv[arg], "flags")) {
+			char *tok, *str;
+
+			/* flags */
+			flags = 0;
+			if (++arg >= argc)
+				error(1, 0, " missing flags value");
+
+			/* comma separated list; strtok() splits it in place */
+			for (str = argv[arg]; (tok = strtok(str, ","));
+			     str = NULL) {
+				if (!strcmp(tok, "subflow"))
+					flags |= MPTCP_PM_ADDR_FLAG_SUBFLOW;
+				else if (!strcmp(tok, "signal"))
+					flags |= MPTCP_PM_ADDR_FLAG_SIGNAL;
+				else if (!strcmp(tok, "backup"))
+					flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
+				else
+					/*
+					 * report the offending token: argv[arg]
+					 * has been cut at the first comma
+					 */
+					error(1, 0,
+					      "unknown flag %s", tok);
+			}
+
+			rta = (void *)(data + off);
+			rta->rta_type = MPTCP_PM_ADDR_ATTR_FLAGS;
+			rta->rta_len = RTA_LENGTH(4);
+			memcpy(RTA_DATA(rta), &flags, 4);
+			off += NLMSG_ALIGN(rta->rta_len);
+		} else if (!strcmp(argv[arg], "id")) {
+			if (++arg >= argc)
+				error(1, 0, " missing id value");
+
+			id = atoi(argv[arg]);
+			rta = (void *)(data + off);
+			rta->rta_type = MPTCP_PM_ADDR_ATTR_ID;
+			rta->rta_len = RTA_LENGTH(1);
+			memcpy(RTA_DATA(rta), &id, 1);
+			off += NLMSG_ALIGN(rta->rta_len);
+		} else if (!strcmp(argv[arg], "dev")) {
+			int32_t ifindex;
+
+			if (++arg >= argc)
+				error(1, 0, " missing dev name");
+
+			ifindex = if_nametoindex(argv[arg]);
+			if (!ifindex)
+				error(1, errno, "unknown device %s", argv[arg]);
+
+			rta = (void *)(data + off);
+			rta->rta_type = MPTCP_PM_ADDR_ATTR_IF_IDX;
+			rta->rta_len = RTA_LENGTH(4);
+			memcpy(RTA_DATA(rta), &ifindex, 4);
+			off += NLMSG_ALIGN(rta->rta_len);
+		} else
+			error(1, 0, "unknown keyword %s", argv[arg]);
+	}
+	/* close the nest: header plus everything appended since nest_start */
+	nest->rta_len = off - nest_start;
+
+	do_nl_req(fd, nh, off, 0);
+	return 0;
+}
+
+/*
+ * "del" sub-command: send MPTCP_PM_CMD_DEL_ADDR for one address id.
+ *
+ * @fd:        netlink socket
+ * @pm_family: netlink message type used for the request
+ * @argc:      must be exactly 3, otherwise syntax() is called
+ * @argv:      argv[2] is the address id, parsed with atoi() into 8 bits
+ *
+ * No reply is fetched. Always returns 0.
+ */
+int del_addr(int fd, int pm_family, int argc, char *argv[])
+{
+	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+		  1024];
+	struct rtattr *rta, *nest;
+	struct nlmsghdr *nh;
+	int nest_start;
+	u_int8_t id;
+	int off = 0;
+
+	memset(data, 0, sizeof(data));
+	nh = (void *)data;
+	off = init_genl_req(data, pm_family, MPTCP_PM_CMD_DEL_ADDR,
+			    MPTCP_PM_VER);
+
+	/* the only argument is the address id */
+	if (argc != 3)
+		syntax(argv);
+
+	id = atoi(argv[2]);
+
+	nest_start = off;
+	nest = (void *)(data + off);
+	nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
+	nest->rta_len = RTA_LENGTH(0);
+	off += NLMSG_ALIGN(nest->rta_len);
+
+	/* build a dummy addr with only the ID set */
+	rta = (void *)(data + off);
+	rta->rta_type = MPTCP_PM_ADDR_ATTR_ID;
+	rta->rta_len = RTA_LENGTH(1);
+	memcpy(RTA_DATA(rta), &id, 1);
+	off += NLMSG_ALIGN(rta->rta_len);
+	nest->rta_len = off - nest_start;
+
+	do_nl_req(fd, nh, off, 0);
+	return 0;
+}
+
+/*
+ * Print one address entry on a single line of stdout.
+ *
+ * @attrs: first attribute nested inside an MPTCP_PM_ATTR_ADDR attribute
+ * @len:   length in bytes of the nested attribute stream
+ *
+ * The address is only accepted when a matching MPTCP_PM_ADDR_ATTR_FAMILY
+ * attribute has been seen earlier in the stream; otherwise the mismatch is
+ * reported through error(1, ...). Flag bits without a known name are
+ * printed in hex.
+ */
+static void print_addr(struct rtattr *attrs, int len)
+{
+	uint16_t family = 0;
+	char str[1024];
+	uint32_t flags;
+	uint8_t id;
+
+	while (RTA_OK(attrs, len)) {
+		if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FAMILY)
+			memcpy(&family, RTA_DATA(attrs), 2);
+		if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR4) {
+			if (family != AF_INET)
+				error(1, 0, "wrong IP (v4) for family %d",
+				      family);
+			inet_ntop(AF_INET, RTA_DATA(attrs), str, sizeof(str));
+			printf("%s ", str);
+		}
+		if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ADDR6) {
+			if (family != AF_INET6)
+				error(1, 0, "wrong IP (v6) for family %d",
+				      family);
+			inet_ntop(AF_INET6, RTA_DATA(attrs), str, sizeof(str));
+			printf("%s ", str);
+		}
+		if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_ID) {
+			memcpy(&id, RTA_DATA(attrs), 1);
+			printf("id %d ", id);
+		}
+		if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_FLAGS) {
+			memcpy(&flags, RTA_DATA(attrs), 4);
+
+			/*
+			 * each printed bit is cleared, so "flags" being still
+			 * non-zero means another item follows the comma
+			 */
+			printf("flags ");
+			if (flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
+				printf("signal");
+				flags &= ~MPTCP_PM_ADDR_FLAG_SIGNAL;
+				if (flags)
+					printf(",");
+			}
+
+			if (flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+				printf("subflow");
+				flags &= ~MPTCP_PM_ADDR_FLAG_SUBFLOW;
+				if (flags)
+					printf(",");
+			}
+
+			if (flags & MPTCP_PM_ADDR_FLAG_BACKUP) {
+				printf("backup");
+				flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
+				if (flags)
+					printf(",");
+			}
+
+			/* bump unknown flags, if any */
+			if (flags)
+				printf("0x%x", flags);
+			printf(" ");
+		}
+		if (attrs->rta_type == MPTCP_PM_ADDR_ATTR_IF_IDX) {
+			char name[IF_NAMESIZE], *ret;
+			int32_t ifindex;
+
+			memcpy(&ifindex, RTA_DATA(attrs), 4);
+			ret = if_indextoname(ifindex, name);
+			if (ret)
+				printf("dev %s ", ret);
+			else
+				/* trailing space, like every other field */
+				printf("dev unknown/%d ", ifindex);
+		}
+
+		attrs = RTA_NEXT(attrs, len);
+	}
+	printf("\n");
+}
+
+/*
+ * Walk a received buffer and print every address it carries.
+ *
+ * @nh:        first netlink message in the buffer
+ * @pm_family: message type to look at; other types are skipped
+ * @total_len: number of valid bytes in the buffer
+ *
+ * Stops at NLMSG_DONE. NLMSG_ERROR messages are passed to nl_error().
+ * Each nested MPTCP_PM_ATTR_ADDR attribute is handed to print_addr().
+ */
+static void print_addrs(struct nlmsghdr *nh, int pm_family, int total_len)
+{
+	struct rtattr *attrs;
+
+	for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) {
+		int len = nh->nlmsg_len;
+
+		if (nh->nlmsg_type == NLMSG_DONE)
+			break;
+		if (nh->nlmsg_type == NLMSG_ERROR)
+			nl_error(nh);
+		if (nh->nlmsg_type != pm_family)
+			continue;
+
+		len -= NLMSG_LENGTH(GENL_HDRLEN);
+		attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) +
+					   GENL_HDRLEN);
+		while (RTA_OK(attrs, len)) {
+			/*
+			 * rta_len also counts the attribute header: only the
+			 * payload holds nested attributes, so pass that length
+			 * to keep the nested walk inside the nest
+			 */
+			if (attrs->rta_type ==
+			    (MPTCP_PM_ATTR_ADDR | NLA_F_NESTED))
+				print_addr((void *)RTA_DATA(attrs),
+					   RTA_PAYLOAD(attrs));
+			attrs = RTA_NEXT(attrs, len);
+		}
+	}
+}
+
+/*
+ * "get" sub-command: send MPTCP_PM_CMD_GET_ADDR for one address id and
+ * print the reply.
+ *
+ * @fd:        netlink socket
+ * @pm_family: netlink message type used for the request
+ * @argc:      must be exactly 3, otherwise syntax() is called
+ * @argv:      argv[2] is the address id, parsed with atoi() into 8 bits
+ *
+ * Always returns 0.
+ */
+int get_addr(int fd, int pm_family, int argc, char *argv[])
+{
+	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+		  1024];
+	struct rtattr *rta, *nest;
+	struct nlmsghdr *nh;
+	int nest_start;
+	u_int8_t id;
+	int off = 0;
+
+	memset(data, 0, sizeof(data));
+	nh = (void *)data;
+	off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR,
+			    MPTCP_PM_VER);
+
+	/* the only argument is the address id */
+	if (argc != 3)
+		syntax(argv);
+
+	id = atoi(argv[2]);
+
+	nest_start = off;
+	nest = (void *)(data + off);
+	nest->rta_type = NLA_F_NESTED | MPTCP_PM_ATTR_ADDR;
+	nest->rta_len = RTA_LENGTH(0);
+	off += NLMSG_ALIGN(nest->rta_len);
+
+	/* build a dummy addr with only the ID set */
+	rta = (void *)(data + off);
+	rta->rta_type = MPTCP_PM_ADDR_ATTR_ID;
+	rta->rta_len = RTA_LENGTH(1);
+	memcpy(RTA_DATA(rta), &id, 1);
+	off += NLMSG_ALIGN(rta->rta_len);
+	nest->rta_len = off - nest_start;
+
+	/* the reply overwrites the request in the same buffer */
+	print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data)));
+	return 0;
+}
+
+/*
+ * "dump" sub-command: send MPTCP_PM_CMD_GET_ADDR with NLM_F_DUMP set and
+ * print whatever the single receive call returns.
+ *
+ * @fd:        netlink socket
+ * @pm_family: netlink message type used for the request
+ * @argc:      unused
+ * @argv:      unused
+ *
+ * Always returns 0.
+ */
+int dump_addrs(int fd, int pm_family, int argc, char *argv[])
+{
+	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+		  1024];
+	pid_t pid = getpid();
+	struct nlmsghdr *nh;
+	int off = 0;
+
+	memset(data, 0, sizeof(data));
+	nh = (void *)data;
+	off = init_genl_req(data, pm_family, MPTCP_PM_CMD_GET_ADDR,
+			    MPTCP_PM_VER);
+	nh->nlmsg_flags |= NLM_F_DUMP;
+	nh->nlmsg_seq = 1;
+	nh->nlmsg_pid = pid;
+	nh->nlmsg_len = off;
+
+	print_addrs(nh, pm_family, do_nl_req(fd, nh, off, sizeof(data)));
+	return 0;
+}
+
+/*
+ * "flush" sub-command: send MPTCP_PM_CMD_FLUSH_ADDRS without attributes.
+ *
+ * @fd:        netlink socket
+ * @pm_family: netlink message type used for the request
+ * @argc:      unused
+ * @argv:      unused
+ *
+ * No reply is fetched. Always returns 0.
+ */
+int flush_addrs(int fd, int pm_family, int argc, char *argv[])
+{
+	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+		  1024];
+	struct nlmsghdr *nh;
+	int off = 0;
+
+	memset(data, 0, sizeof(data));
+	nh = (void *)data;
+	off = init_genl_req(data, pm_family, MPTCP_PM_CMD_FLUSH_ADDRS,
+			    MPTCP_PM_VER);
+
+	do_nl_req(fd, nh, off, 0);
+	return 0;
+}
+
+/*
+ * Walk a received buffer and print the limits it carries, one per line.
+ *
+ * @nh:        first netlink message in the buffer
+ * @pm_family: message type to look at; other types are skipped
+ * @total_len: number of valid bytes in the buffer
+ *
+ * MPTCP_PM_ATTR_SUBFLOWS is printed as "subflows <n>" and
+ * MPTCP_PM_ATTR_RCV_ADD_ADDRS as "accept <n>"; any other attribute is
+ * ignored. Stops at NLMSG_DONE.
+ */
+static void print_limits(struct nlmsghdr *nh, int pm_family, int total_len)
+{
+	struct rtattr *attrs;
+	uint32_t max;
+
+	for (; NLMSG_OK(nh, total_len); nh = NLMSG_NEXT(nh, total_len)) {
+		int len = nh->nlmsg_len;
+
+		if (nh->nlmsg_type == NLMSG_DONE)
+			break;
+		if (nh->nlmsg_type == NLMSG_ERROR)
+			nl_error(nh);
+		if (nh->nlmsg_type != pm_family)
+			continue;
+
+		len -= NLMSG_LENGTH(GENL_HDRLEN);
+		attrs = (struct rtattr *) ((char *) NLMSG_DATA(nh) +
+					   GENL_HDRLEN);
+		while (RTA_OK(attrs, len)) {
+			int type = attrs->rta_type;
+
+			if (type != MPTCP_PM_ATTR_RCV_ADD_ADDRS &&
+			    type != MPTCP_PM_ATTR_SUBFLOWS)
+				goto next;
+
+			memcpy(&max, RTA_DATA(attrs), 4);
+			printf("%s %u\n", type == MPTCP_PM_ATTR_SUBFLOWS ?
+					  "subflows" : "accept", max);
+
+next:
+			attrs = RTA_NEXT(attrs, len);
+		}
+	}
+}
+
+/*
+ * "limits" sub-command: read or write the two path manager limits.
+ *
+ * @fd:        netlink socket
+ * @pm_family: netlink message type used for the request
+ * @argc:      4 selects MPTCP_PM_CMD_SET_LIMITS, anything else selects
+ *             MPTCP_PM_CMD_GET_LIMITS
+ * @argv:      when setting, argv[2] is the MPTCP_PM_ATTR_RCV_ADD_ADDRS value
+ *             and argv[3] the MPTCP_PM_ATTR_SUBFLOWS value, both parsed
+ *             with atoi()
+ *
+ * The set command does not fetch a reply; the get command prints it with
+ * print_limits(). Always returns 0.
+ */
+int get_set_limits(int fd, int pm_family, int argc, char *argv[])
+{
+	char data[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		  NLMSG_ALIGN(sizeof(struct genlmsghdr)) +
+		  1024];
+	uint32_t rcv_addr = 0, subflows = 0;
+	int cmd, len = sizeof(data);
+	struct nlmsghdr *nh;
+	int off = 0;
+
+	/* limit */
+	if (argc == 4) {
+		rcv_addr = atoi(argv[2]);
+		subflows = atoi(argv[3]);
+		cmd = MPTCP_PM_CMD_SET_LIMITS;
+	} else {
+		cmd = MPTCP_PM_CMD_GET_LIMITS;
+	}
+
+	memset(data, 0, sizeof(data));
+	nh = (void *)data;
+	off = init_genl_req(data, pm_family, cmd, MPTCP_PM_VER);
+
+	/* limit */
+	if (cmd == MPTCP_PM_CMD_SET_LIMITS) {
+		struct rtattr *rta = (void *)(data + off);
+
+		rta->rta_type = MPTCP_PM_ATTR_RCV_ADD_ADDRS;
+		rta->rta_len = RTA_LENGTH(4);
+		memcpy(RTA_DATA(rta), &rcv_addr, 4);
+		off += NLMSG_ALIGN(rta->rta_len);
+
+		rta = (void *)(data + off);
+		rta->rta_type = MPTCP_PM_ATTR_SUBFLOWS;
+		rta->rta_len = RTA_LENGTH(4);
+		memcpy(RTA_DATA(rta), &subflows, 4);
+		off += NLMSG_ALIGN(rta->rta_len);
+
+		/* do not expect a reply */
+		len = 0;
+	}
+
+	len = do_nl_req(fd, nh, off, len);
+	if (cmd == MPTCP_PM_CMD_GET_LIMITS)
+		print_limits(nh, pm_family, len);
+	return 0;
+}
+
+/*
+ * Entry point: open a NETLINK_GENERIC socket, resolve the MPTCP path
+ * manager family and dispatch on the sub-command in argv[1]
+ * (add, del, flush, get, dump, limits).
+ *
+ * Returns the value of the selected handler. With fewer than two arguments
+ * or an unknown sub-command, syntax() is called.
+ */
+int main(int argc, char *argv[])
+{
+	int fd, pm_family;
+
+	if (argc < 2)
+		syntax(argv);
+
+	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
+	if (fd == -1)
+		error(1, errno, "socket netlink");
+
+	pm_family = resolve_mptcp_pm_netlink(fd);
+
+	if (!strcmp(argv[1], "add"))
+		return add_addr(fd, pm_family, argc, argv);
+	else if (!strcmp(argv[1], "del"))
+		return del_addr(fd, pm_family, argc, argv);
+	else if (!strcmp(argv[1], "flush"))
+		return flush_addrs(fd, pm_family, argc, argv);
+	else if (!strcmp(argv[1], "get"))
+		return get_addr(fd, pm_family, argc, argv);
+	else if (!strcmp(argv[1], "dump"))
+		return dump_addrs(fd, pm_family, argc, argv);
+	else if (!strcmp(argv[1], "limits"))
+		return get_set_limits(fd, pm_family, argc, argv);
+
+	/* end the line so that what syntax() prints starts on a fresh one */
+	fprintf(stderr, "unknown sub-command: %s\n", argv[1]);
+	syntax(argv);
+	return 0;
+}