From ba1a6c7bc0ff33e405f5156dc8f4145437255f1f Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 23 Aug 2008 13:28:27 +0200 Subject: [PATCH 001/105] dccp: Always generate a Reset in response to option errors RFC4340 states that if a packet is received with an option error (such as a Mandatory Option as the last byte of the option list), the endpoint should repond with a Reset. In the LISTEN and RESPOND states, the endpoint correctly reponds with Reset, while in the REQUEST/OPEN states, packets with option errors are just ignored. The packet sequence is as follows: Case 1: Endpoint A Endpoint B (CLOSED) (CLOSED) <---------------- REQUEST RESPONSE -----------------> (*1) (with invalid option) <---------------- RESET (with Reset Code 5, "Option Error") (*1) currently just ignored, no Reset is sent Case 2: Endpoint A Endpoint B (OPEN) (OPEN) DATA-ACK -----------------> (*2) (with invalid option) <---------------- RESET (with Reset Code 5, "Option Error") (*2) currently just ignored, no Reset is sent This patch fixes the problem, by generating a Reset instead of silently ignoring option errors. Signed-off-by: Wei Yongjun Acked-by: Arnaldo Carvalho de Melo Acked-by: Gerrit Renker --- net/dccp/input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/dccp/input.c b/net/dccp/input.c index 803933ab396d..779d0ed9ae94 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -370,7 +370,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, goto discard; if (dccp_parse_options(sk, NULL, skb)) - goto discard; + return 1; if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); @@ -610,7 +610,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * Step 8: Process options and mark acknowledgeable */ if (dccp_parse_options(sk, NULL, skb)) - goto discard; + return 1; if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); From faf61c3319ea336ed47acd6ca86faaaa3a8f4937 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sat, 23 Aug 2008 13:28:27 +0200 Subject: [PATCH 002/105] dccp: Silently ignore options with nonsensical lengths This updates the option-parsing code with regard to RFC 4340, 5.8: "[..] options with nonsensical lengths (length byte less than two or more than the remaining space in the options portion of the header) MUST be ignored, and any option space following an option with nonsensical length MUST likewise be ignored." Hence in the following cases erratic options will be ignored: 1. The type byte of a multi-byte option is the last byte of the header options (i.e. effective option length of 1). 2. The value of the length byte is less than the minimum 2. This has been changed from previously 3: although no multi-byte option with a length less than 3 yet exists (cf. table 3 in 5.8), a length of 2 is valid. (The switch-statement in dccp_parse has further per-option length checks.) 3. The option length exceeds the length of the remaining option space. Signed-off-by: Gerrit Renker --- net/dccp/options.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/dccp/options.c b/net/dccp/options.c index dc7c158a2f4b..4284f0856047 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -81,11 +81,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, /* Check if this isn't a single byte option */ if (opt > DCCPO_MAX_RESERVED) { if (opt_ptr == opt_end) - goto out_invalid_option; + goto out_nonsensical_length; len = *opt_ptr++; - if (len < 3) - goto out_invalid_option; + if (len < 2) + goto out_nonsensical_length; /* * Remove the type and len fields, leaving * just the value size @@ -95,7 +95,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, opt_ptr += len; if (opt_ptr > opt_end) - goto out_invalid_option; + goto out_nonsensical_length; } /* @@ -283,6 +283,8 @@ ignore_option: if (mandatory) goto out_invalid_option; +out_nonsensical_length: + /* RFC 4340, 5.8: ignore option and all remaining option space */ return 0; out_invalid_option: From eac7726bf5cd24440d84b166e0813668d1bf3224 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sat, 23 Aug 2008 13:28:27 +0200 Subject: [PATCH 003/105] dccp: Fill in the Data fields for "Option Error" Resets This updates the use of the `out_invalid_option' label, which produces a Reset (code 5, "Option Error"), to fill in the Data1...Data3 fields as specified in RFC 4340, 5.6. Signed-off-by: Gerrit Renker --- net/dccp/options.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/dccp/options.c b/net/dccp/options.c index 4284f0856047..0809b63cb055 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -291,6 +291,9 @@ out_invalid_option: DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len); + DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; + DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; + DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; return -1; } From 48816322ad4d9ce195aaddd10f0ce98c944af193 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sat, 23 Aug 2008 13:28:27 +0200 Subject: [PATCH 004/105] dccp: Empty the write queue when disconnecting dccp_disconnect() can be called due to several reasons: 1. when the connection setup failed (inet_stream_connect()); 2. when shutting down (inet_shutdown(), inet_csk_listen_stop()); 3. when aborting the connection (dccp_close() with 0 linger time). In case (1) the write queue is empty. This patch empties the write queue, if in case (2) or (3) it was not yet empty. This avoids triggering the write-queue BUG_TRAP in sk_stream_kill_queues() later on. It also seems natural to do: when breaking an association, to delete all packets that were originally intended for the soon-disconnected end (compare with call to tcp_write_queue_purge in tcp_disconnect()). Signed-off-by: Gerrit Renker --- net/dccp/proto.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 1ca3b26eed0f..ae66473b11fa 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -309,7 +309,9 @@ int dccp_disconnect(struct sock *sk, int flags) sk->sk_err = ECONNRESET; dccp_clear_xmit_timers(sk); + __skb_queue_purge(&sk->sk_receive_queue); + __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { __kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; From 432649916b0435b608fb3e1fcb97347ac294d38d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sat, 23 Aug 2008 13:28:27 +0200 Subject: [PATCH 005/105] dccp: Toggle debug output without module unloading This sets the sysfs permissions so that root can toggle the `debug' parameter available for nearly every DCCP module. This is useful since there are various module inter-dependencies. The debug flag can now be toggled at runtime using echo 1 > /sys/module/dccp/parameters/dccp_debug echo 1 > /sys/module/dccp_ccid2/parameters/ccid2_debug echo 1 > /sys/module/dccp_ccid3/parameters/ccid3_debug echo 1 > /sys/module/dccp_tfrc_lib/parameters/tfrc_debug The last is not very useful yet, since no code at the moment calls the tfrc_debug() macro. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid2.c | 2 +- net/dccp/ccids/ccid3.c | 2 +- net/dccp/ccids/lib/tfrc.c | 2 +- net/dccp/proto.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 8e9580874216..9a430734530c 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -783,7 +783,7 @@ static struct ccid_operations ccid2 = { }; #ifdef CONFIG_IP_DCCP_CCID2_DEBUG -module_param(ccid2_debug, bool, 0444); +module_param(ccid2_debug, bool, 0644); MODULE_PARM_DESC(ccid2_debug, "Enable debug messages"); #endif diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index f6756e0c9e69..3b8bd7ca6761 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -963,7 +963,7 @@ static struct ccid_operations ccid3 = { }; #ifdef CONFIG_IP_DCCP_CCID3_DEBUG -module_param(ccid3_debug, bool, 0444); +module_param(ccid3_debug, bool, 0644); MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); #endif diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c index 97ecec0a8e76..185916218e07 100644 --- a/net/dccp/ccids/lib/tfrc.c +++ b/net/dccp/ccids/lib/tfrc.c @@ -10,7 +10,7 @@ #ifdef CONFIG_IP_DCCP_TFRC_DEBUG int tfrc_debug; -module_param(tfrc_debug, bool, 0444); +module_param(tfrc_debug, bool, 0644); MODULE_PARM_DESC(tfrc_debug, "Enable debug messages"); #endif diff --git a/net/dccp/proto.c b/net/dccp/proto.c index ae66473b11fa..d0bd34819761 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1030,7 +1030,7 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); #ifdef CONFIG_IP_DCCP_DEBUG int dccp_debug; -module_param(dccp_debug, bool, 0444); +module_param(dccp_debug, bool, 0644); MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); EXPORT_SYMBOL_GPL(dccp_debug); From 959fd992f05b7468bf30d759ac0c9fd0ef0fa80b Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sat, 23 Aug 2008 13:28:27 +0200 Subject: [PATCH 006/105] dccp ccid-3: Replace lazy BUG_ON with condition The BUG_ON(w_tot == 0) only holds if there is no more than 1 loss interval in the loss history. If there is only a single loss interval, the calc_i_mean() routine need in fact not be called (RFC 3448, 6.3.1). Signed-off-by: Gerrit Renker --- net/dccp/ccids/lib/loss_interval.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index bcd6ac415bb9..5b3ce0688c5c 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -67,7 +67,10 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0; int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */ - for (i=0; i <= k; i++) { + if (k <= 0) + return; + + for (i = 0; i <= k; i++) { i_i = tfrc_lh_get_interval(lh, i); if (i < k) { @@ -78,7 +81,6 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) i_tot1 += i_i * tfrc_lh_weights[i-1]; } - BUG_ON(w_tot == 0); lh->i_mean = max(i_tot0, i_tot1) / w_tot; } From 5c7c9451f1f422b69bf0e161e471dd3976ecd408 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 007/105] dccp: Basic data structure for feature negotiation This patch prepares for the new and extended feature-negotiation routines. The following feature-negotiation data structures are provided: * a container for the various (SP or NN) values, * symbolic state names to track feature states, * an entry struct which holds all current information together, * elementary functions to fill in and process these structures. Entry structs are arranged as FIFO for the following reason: RFC 4340 specifies that if multiple options of the same type are present, they are processed in the order of their appearance in the packet; which means that this order needs to be preserved in the local data structure (the later insertion code also respects this order). The struct list_head has been chosen for the following reasons: the most frequent operations are * add new entry at tail (when receiving Change or setting socket options); * delete entry (when Confirm has been received); * deep copy of entire list (cloning from listening socket onto request socket). The NN value has been set to 64 bit, which is a currently sufficient upper limit (Sequence Window feature has 48 bit). Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/feat.c | 14 ++++++++++++ net/dccp/feat.h | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 933a0ecf8d46..94a81b8e5efb 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -23,6 +23,20 @@ #define DCCP_FEAT_SP_NOAGREE (-123) +/* copy constructor, fval must not already contain allocated memory */ +static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) +{ + fval->sp.len = len; + if (fval->sp.len > 0) { + fval->sp.vec = kmemdup(val, len, gfp_any()); + if (fval->sp.vec == NULL) { + fval->sp.len = 0; + return -ENOBUFS; + } + } + return 0; +} + int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, u8 *val, u8 len, gfp_t gfp) { diff --git a/net/dccp/feat.h b/net/dccp/feat.h index e272222c7ace..94203c230c65 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -14,6 +14,66 @@ #include #include "dccp.h" +enum dccp_feat_type { + FEAT_AT_RX = 1, /* located at RX side of half-connection */ + FEAT_AT_TX = 2, /* located at TX side of half-connection */ + FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ + FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ + FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ +}; + +enum dccp_feat_state { + FEAT_DEFAULT = 0, /* using default values from 6.4 */ + FEAT_INITIALISING, /* feature is being initialised */ + FEAT_CHANGING, /* Change sent but not confirmed yet */ + FEAT_UNSTABLE, /* local modification in state CHANGING */ + FEAT_STABLE /* both ends (think they) agree */ +}; + +/** + * dccp_feat_val - Container for SP or NN feature values + * @nn: single NN value + * @sp.vec: single SP value plus optional preference list + * @sp.len: length of @sp.vec in bytes + */ +typedef union { + u64 nn; + struct { + u8 *vec; + u8 len; + } sp; +} dccp_feat_val; + +/** + * struct feat_entry - Data structure to perform feature negotiation + * @feat_num: one of %dccp_feature_numbers + * @val: feature's current value (SP features may have preference list) + * @state: feature's current state + * @needs_mandatory: whether Mandatory options should be sent + * @needs_confirm: whether to send a Confirm instead of a Change + * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) + * @is_local: feature location (1) or feature-remote (0) + * @node: list pointers, entries arranged in FIFO order + */ +struct dccp_feat_entry { + u8 feat_num; + dccp_feat_val val; + enum dccp_feat_state state:8; + bool needs_mandatory:1, + needs_confirm:1, + empty_confirm:1, + is_local:1; + + struct list_head node; +}; + +static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) +{ + if (entry->needs_confirm) + return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; + return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; +} + #ifdef CONFIG_IP_DCCP_DEBUG extern const char *dccp_feat_typename(const u8 type); extern const char *dccp_feat_name(const u8 feat); From b4eec206370b7154dc354dc30f0a3f02ea8468b2 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 008/105] dccp: Implement lookup table for feature-negotiation information A lookup table for feature-negotiation information, extracted from RFC 4340/42, is provided by this patch. All currently known features can be found in this table, along with their feature location, their default value, and type. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- include/linux/dccp.h | 9 ++-- net/dccp/feat.c | 115 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 4 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6080449fbec9..3978aff197d9 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -176,19 +176,20 @@ enum { }; /* DCCP features (RFC 4340 section 6.4) */ -enum { +enum dccp_feature_numbers { DCCPF_RESERVED = 0, DCCPF_CCID = 1, - DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ + DCCPF_SHORT_SEQNOS = 2, DCCPF_SEQUENCE_WINDOW = 3, - DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ + DCCPF_ECN_INCAPABLE = 4, DCCPF_ACK_RATIO = 5, DCCPF_SEND_ACK_VECTOR = 6, DCCPF_SEND_NDP_COUNT = 7, DCCPF_MIN_CSUM_COVER = 8, - DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ + DCCPF_DATA_CHECKSUM = 9, /* 10-127 reserved */ DCCPF_MIN_CCID_SPECIFIC = 128, + DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 8.4 */ DCCPF_MAX_CCID_SPECIFIC = 255, }; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 94a81b8e5efb..d7468f70544d 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -23,6 +23,80 @@ #define DCCP_FEAT_SP_NOAGREE (-123) +static const struct { + u8 feat_num; /* DCCPF_xxx */ + enum dccp_feat_type rxtx; /* RX or TX */ + enum dccp_feat_type reconciliation; /* SP or NN */ + u8 default_value; /* as in 6.4 */ +/* + * Lookup table for location and type of features (from RFC 4340/4342) + * +--------------------------+----+-----+----+----+---------+-----------+ + * | Feature | Location | Reconc. | Initial | Section | + * | | RX | TX | SP | NN | Value | Reference | + * +--------------------------+----+-----+----+----+---------+-----------+ + * | DCCPF_CCID | | X | X | | 2 | 10 | + * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | + * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | + * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | + * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | + * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | + * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | + * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | + * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | + * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | + * +--------------------------+----+-----+----+----+---------+-----------+ + */ +} dccp_feat_table[] = { + { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2 }, + { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0 }, + { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100 }, + { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0 }, + { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2 }, + { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0 }, + { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0 }, + { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0 }, + { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0 }, + { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0 }, +}; +#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) + +/** + * dccp_feat_index - Hash function to map feature number into array position + * Returns consecutive array index or -1 if the feature is not understood. + */ +static int dccp_feat_index(u8 feat_num) +{ + /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ + if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) + return feat_num - 1; + + /* + * Other features: add cases for new feature types here after adding + * them to the above table. + */ + switch (feat_num) { + case DCCPF_SEND_LEV_RATE: + return DCCP_FEAT_SUPPORTED_MAX - 1; + } + return -1; +} + +static u8 dccp_feat_type(u8 feat_num) +{ + int idx = dccp_feat_index(feat_num); + + if (idx < 0) + return FEAT_UNKNOWN; + return dccp_feat_table[idx].reconciliation; +} + +static int dccp_feat_default_value(u8 feat_num) +{ + int idx = dccp_feat_index(feat_num); + + return idx < 0 ? : dccp_feat_table[idx].default_value; +} + /* copy constructor, fval must not already contain allocated memory */ static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) { @@ -37,6 +111,45 @@ static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) return 0; } +static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) +{ + if (unlikely(val == NULL)) + return; + if (dccp_feat_type(feat_num) == FEAT_SP) + kfree(val->sp.vec); + memset(val, 0, sizeof(*val)); +} + +static struct dccp_feat_entry * + dccp_feat_clone_entry(struct dccp_feat_entry const *original) +{ + struct dccp_feat_entry *new; + u8 type = dccp_feat_type(original->feat_num); + + if (type == FEAT_UNKNOWN) + return NULL; + + new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); + if (new == NULL) + return NULL; + + if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, + original->val.sp.vec, + original->val.sp.len)) { + kfree(new); + return NULL; + } + return new; +} + +static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) +{ + if (entry != NULL) { + dccp_feat_val_destructor(entry->feat_num, &entry->val); + kfree(entry); + } +} + int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, u8 *val, u8 len, gfp_t gfp) { @@ -653,6 +766,8 @@ const char *dccp_feat_name(const u8 feat) if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) return feature_names[DCCPF_RESERVED]; + if (feat == DCCPF_SEND_LEV_RATE) + return "Send Loss Event Rate"; if (feat >= DCCPF_MIN_CCID_SPECIFIC) return "CCID-specific"; From 3001fc0569651f2d0c3b45adc991351471b0c382 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 009/105] dccp: List management for new feature negotiation This adds list fields and list management functions for the new feature negotiation implementation. The new code is kept in parallel to the old code, until removed at the end of the patch set. Thanks to Arnaldo for suggestions to improve the code. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/feat.c | 129 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index d7468f70544d..2ec2cd117699 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -150,6 +150,135 @@ static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) } } +/* + * List management functions + * + * Feature negotiation lists rely on and maintain the following invariants: + * - each feat_num in the list is known, i.e. we know its type and default value + * - each feat_num/is_local combination is unique (old entries are overwritten) + * - SP values are always freshly allocated + * - list is sorted in increasing order of feature number (faster lookup) + */ +static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, + u8 feat_num, bool is_local) +{ + struct dccp_feat_entry *entry; + + list_for_each_entry(entry, fn_list, node) + if (entry->feat_num == feat_num && entry->is_local == is_local) + return entry; + else if (entry->feat_num > feat_num) + break; + return NULL; +} + +/** + * dccp_feat_entry_new - Central list update routine (called by all others) + * @head: list to add to + * @feat: feature number + * @local: whether the local (1) or remote feature with number @feat is meant + * This is the only constructor and serves to ensure the above invariants. + */ +static struct dccp_feat_entry * + dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) +{ + struct dccp_feat_entry *entry; + + list_for_each_entry(entry, head, node) + if (entry->feat_num == feat && entry->is_local == local) { + dccp_feat_val_destructor(entry->feat_num, &entry->val); + return entry; + } else if (entry->feat_num > feat) { + head = &entry->node; + break; + } + + entry = kmalloc(sizeof(*entry), gfp_any()); + if (entry != NULL) { + entry->feat_num = feat; + entry->is_local = local; + list_add_tail(&entry->node, head); + } + return entry; +} + +/** + * dccp_feat_push_change - Add/overwrite a Change option in the list + * @fn_list: feature-negotiation list to update + * @feat: one of %dccp_feature_numbers + * @local: whether local (1) or remote (0) @feat_num is meant + * @needs_mandatory: whether to use Mandatory feature negotiation options + * @fval: pointer to NN/SP value to be inserted (will be copied) + */ +static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, + u8 mandatory, dccp_feat_val *fval) +{ + struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); + + if (new == NULL) + return -ENOMEM; + + new->feat_num = feat; + new->is_local = local; + new->state = FEAT_INITIALISING; + new->needs_confirm = 0; + new->empty_confirm = 0; + new->val = *fval; + new->needs_mandatory = mandatory; + + return 0; +} + +/** + * dccp_feat_push_confirm - Add a Confirm entry to the FN list + * @fn_list: feature-negotiation list to add to + * @feat: one of %dccp_feature_numbers + * @local: whether local (1) or remote (0) @feat_num is being confirmed + * @fval: pointer to NN/SP value to be inserted or NULL + * Returns 0 on success, a Reset code for further processing otherwise. + */ +static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, + dccp_feat_val *fval) +{ + struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); + + if (new == NULL) + return DCCP_RESET_CODE_TOO_BUSY; + + new->feat_num = feat; + new->is_local = local; + new->state = FEAT_STABLE; /* transition in 6.6.2 */ + new->needs_confirm = 1; + new->empty_confirm = (fval == NULL); + new->val.nn = 0; /* zeroes the whole structure */ + if (!new->empty_confirm) + new->val = *fval; + new->needs_mandatory = 0; + + return 0; +} + +static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) +{ + return dccp_feat_push_confirm(fn_list, feat, local, NULL); +} + +static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) +{ + list_del(&entry->node); + dccp_feat_entry_destructor(entry); +} + +void dccp_feat_list_purge(struct list_head *fn_list) +{ + struct dccp_feat_entry *entry, *next; + + list_for_each_entry_safe(entry, next, fn_list, node) + dccp_feat_entry_destructor(entry); + INIT_LIST_HEAD(fn_list); +} +EXPORT_SYMBOL_GPL(dccp_feat_list_purge); + int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, u8 *val, u8 len, gfp_t gfp) { From 828755cee087e4a34f45d6c9db661ccd0631cc6d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 010/105] dccp: Per-socket initialisation of feature negotiation This provides feature-negotiation initialisation for both DCCP sockets and DCCP request_sockets, to support feature negotiation during connection setup. It also resolves a FIXME regarding the congestion control initialisation. Thanks to Wei Yongjun for help with the IPv6 side of this patch. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- include/linux/dccp.h | 4 ++++ net/dccp/dccp.h | 3 ++- net/dccp/feat.c | 19 +++++++++++++++++++ net/dccp/feat.h | 1 + net/dccp/input.c | 2 -- net/dccp/ipv4.c | 3 ++- net/dccp/ipv6.c | 3 ++- net/dccp/minisocks.c | 7 ++++++- net/dccp/proto.c | 1 + 9 files changed, 37 insertions(+), 6 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 3978aff197d9..484b8a1fb023 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -412,6 +412,7 @@ extern void dccp_minisock_init(struct dccp_minisock *dmsk); * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) * @dreq_isr: initial sequence number received on the Request * @dreq_service: service code present on the Request (there is just one) + * @dreq_featneg: feature negotiation options for this connection * The following two fields are analogous to the ones in dccp_sock: * @dreq_timestamp_echo: last received timestamp to echo (13.1) * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo @@ -421,6 +422,7 @@ struct dccp_request_sock { __u64 dreq_iss; __u64 dreq_isr; __be32 dreq_service; + struct list_head dreq_featneg; __u32 dreq_timestamp_echo; __u32 dreq_timestamp_time; }; @@ -498,6 +500,7 @@ struct dccp_ackvec; * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) * @dccps_minisock - associated minisock (accessed via dccp_msk) + * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) @@ -535,6 +538,7 @@ struct dccp_sock { __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; + struct list_head dccps_featneg; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index b4bc6e095a0e..ab096c06bba0 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -252,7 +252,8 @@ extern const char *dccp_state_name(const int state); extern void dccp_set_state(struct sock *sk, const int state); extern void dccp_done(struct sock *sk); -extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); +extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, + struct sk_buff const *skb); extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 2ec2cd117699..faade82856fe 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -279,6 +279,25 @@ void dccp_feat_list_purge(struct list_head *fn_list) } EXPORT_SYMBOL_GPL(dccp_feat_list_purge); +/* generate @to as full clone of @from - @to must not contain any nodes */ +int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) +{ + struct dccp_feat_entry *entry, *new; + + INIT_LIST_HEAD(to); + list_for_each_entry(entry, from, node) { + new = dccp_feat_clone_entry(entry); + if (new == NULL) + goto cloning_failed; + list_add_tail(&new->node, to); + } + return 0; + +cloning_failed: + dccp_feat_list_purge(to); + return -ENOMEM; +} + int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, u8 *val, u8 len, gfp_t gfp) { diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 94203c230c65..7e953fd0a79b 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -95,6 +95,7 @@ extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len); extern void dccp_feat_clean(struct dccp_minisock *dmsk); extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); +extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); extern int dccp_feat_init(struct dccp_minisock *dmsk); #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c index 779d0ed9ae94..3070015edc75 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -590,8 +590,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) < 0) return 1; - - /* FIXME: do congestion control initialization */ goto discard; } if (dh->dccph_type == DCCP_PKT_RESET) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 882c5c4de69e..0ce84ea89119 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -595,7 +595,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - dccp_reqsk_init(req, skb); + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) + goto drop_and_free; dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 5e1ee0da2c40..33e8a1ea3041 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -424,7 +424,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - dccp_reqsk_init(req, skb); + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) + goto drop_and_free; dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index b2804e2d1b8c..e487133ae079 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -125,6 +125,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk, newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; newicsk->icsk_rto = DCCP_TIMEOUT_INIT; + INIT_LIST_HEAD(&newdp->dccps_featneg); if (dccp_feat_clone(sk, newsk)) goto out_free; @@ -304,7 +305,8 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); -void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) +int dccp_reqsk_init(struct request_sock *req, + struct dccp_sock const *dp, struct sk_buff const *skb) { struct dccp_request_sock *dreq = dccp_rsk(req); @@ -312,6 +314,9 @@ void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) inet_rsk(req)->acked = 0; req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; + + /* inherit feature negotiation options from listening socket */ + return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); } EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index d0bd34819761..1cdf4ae99605 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -193,6 +193,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dccp_init_xmit_timers(sk); + INIT_LIST_HEAD(&dp->dccps_featneg); /* * FIXME: We're hardcoding the CCID, and doing this at this point makes * the listening (master) sock get CCID control blocks, which is not From 702083839b607f390dbed5d2304eb8fc5f4c85ac Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 011/105] dccp: Cleanup routines for feature negotiation This inserts the required de-allocation routines for memory allocated by feature negotiation in the socket destructors, replacing dccp_feat_clean() in one instance. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/dccp.h | 2 ++ net/dccp/ipv4.c | 1 + net/dccp/ipv6.c | 1 + net/dccp/proto.c | 2 +- 4 files changed, 5 insertions(+), 1 deletion(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index ab096c06bba0..dee4a90886d6 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -442,6 +442,8 @@ static inline int dccp_ack_pending(const struct sock *sk) inet_csk_ack_scheduled(sk); } +extern void dccp_feat_list_purge(struct list_head *fn_list); + extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); extern int dccp_insert_option_elapsed_time(struct sock *sk, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 0ce84ea89119..b623f6b25482 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -545,6 +545,7 @@ out: static void dccp_v4_reqsk_destructor(struct request_sock *req) { + dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); kfree(inet_rsk(req)->opt); } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 33e8a1ea3041..ad6212e00435 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -302,6 +302,7 @@ done: static void dccp_v6_reqsk_destructor(struct request_sock *req) { + dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); if (inet6_rsk(req)->pktopts != NULL) kfree_skb(inet6_rsk(req)->pktopts); } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 1cdf4ae99605..dafcefd86594 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -268,7 +268,7 @@ void dccp_destroy_sock(struct sock *sk) dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; /* clean up feature negotiation state */ - dccp_feat_clean(dmsk); + dccp_feat_list_purge(&dp->dccps_featneg); } EXPORT_SYMBOL_GPL(dccp_destroy_sock); From 5591d286281fdfb57914f5fad3ca001d44ce8fc6 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 012/105] dccp: Limit feature negotiation to connection setup phase This patch starts the new implementation of feature negotiation: 1. Although it is theoretically possible to perform feature negotiation at any time (and RFC 4340 supports this), in practice this is prohibitively complex, as it requires to put traffic on hold for each new negotiation. 2. As a byproduct of restricting feature negotiation to connection setup, the feature-negotiation retransmit timer is no longer required. This part is now mapped onto the protocol-level retransmission. Details indicating why timers are no longer needed can be found on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/feature_negotiation/\ implementation_notes.html This patch disables anytime negotiation, subsequent patches work out full feature negotiation support for connection setup. Signed-off-by: Gerrit Renker --- net/dccp/feat.c | 19 ++++++++----------- net/dccp/options.c | 18 ------------------ net/dccp/timer.c | 12 ------------ 3 files changed, 8 insertions(+), 41 deletions(-) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index faade82856fe..77ce2f6b0319 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -6,6 +6,8 @@ * * ASSUMPTIONS * ----------- + * o Feature negotiation is coordinated with connection setup (as in TCP), wild + * changes of parameters of an established connection are not supported. * o All currently known SP features have 1-byte quantities. If in the future * extensions of RFCs 4340..42 define features with item lengths larger than * one byte, a feature-specific extension of the code will be required. @@ -652,6 +654,9 @@ int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) { int rc; + /* Ignore Change requests other than during connection setup */ + if (sk->sk_state != DCCP_LISTEN && sk->sk_state != DCCP_REQUESTING) + return 0; dccp_feat_debug(type, feature, *val); /* figure out if it's SP or NN feature */ @@ -701,6 +706,9 @@ int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, int found = 0; int all_confirmed = 1; + /* Ignore Confirm options other than during connection setup */ + if (sk->sk_state != DCCP_LISTEN && sk->sk_state != DCCP_REQUESTING) + return 0; dccp_feat_debug(type, feature, *val); /* locate our change request */ @@ -735,17 +743,6 @@ int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, all_confirmed = 0; } - /* fix re-transmit timer */ - /* XXX gotta make sure that no option negotiation occurs during - * connection shutdown. Consider that the CLOSEREQ is sent and timer is - * on. if all options are confirmed it might kill timer which should - * remain alive until close is received. - */ - if (all_confirmed) { - dccp_pr_debug("clear feat negotiation timer %p\n", sk); - inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); - } - if (!found) dccp_pr_debug("%s(%d, ...) never requested\n", dccp_feat_typename(type), feature); diff --git a/net/dccp/options.c b/net/dccp/options.c index 0809b63cb055..67a171a1268c 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -489,7 +489,6 @@ static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) { - struct dccp_sock *dp = dccp_sk(sk); struct dccp_minisock *dmsk = dccp_msk(sk); struct dccp_opt_pend *opt, *next; int change = 0; @@ -530,23 +529,6 @@ static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) } } - /* Retransmit timer. - * If this is the master listening sock, we don't set a timer on it. It - * should be fine because if the dude doesn't receive our RESPONSE - * [which will contain the CHANGE] he will send another REQUEST which - * will "retrnasmit" the change. - */ - if (change && dp->dccps_role != DCCP_ROLE_LISTEN) { - dccp_pr_debug("reset feat negotiation timer %p\n", sk); - - /* XXX don't reset the timer on re-transmissions. I.e. reset it - * only when sending new stuff i guess. Currently the timer - * never backs off because on re-transmission it just resets it! - */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, DCCP_RTO_MAX); - } - return 0; } diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 54b3c7e9e016..162d1e683c39 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -87,17 +87,6 @@ static void dccp_retransmit_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - /* retransmit timer is used for feature negotiation throughout - * connection. In this case, no packet is re-transmitted, but rather an - * ack is generated and pending changes are placed into its options. - */ - if (sk->sk_send_head == NULL) { - dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk); - if (sk->sk_state == DCCP_OPEN) - dccp_send_ack(sk); - goto backoff; - } - /* * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was * sent, no need to retransmit, this sock is dead. @@ -126,7 +115,6 @@ static void dccp_retransmit_timer(struct sock *sk) return; } -backoff: icsk->icsk_backoff++; icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); From 86349c8d9c6892b57aff4549256ab1aa65aed0f0 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 013/105] dccp: Registration routines for changing feature values Two registration routines, for SP and NN features, are provided by this patch, replacing a previous routine which was used for both feature types. These are internal-only routines and therefore start with `__feat_register'. It further exports the known limits of Sequence Window and Ack Ratio as symbolic constants. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/ccids/ccid2.c | 6 +- net/dccp/feat.c | 123 +++++++++++++++++++++++++++++++++-------- net/dccp/feat.h | 25 ++++++++- net/dccp/proto.c | 2 +- 4 files changed, 128 insertions(+), 28 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 9a430734530c..c9ea19a4d85e 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -25,7 +25,7 @@ /* * This implementation should follow RFC 4341 */ - +#include "../feat.h" #include "../ccid.h" #include "../dccp.h" #include "ccid2.h" @@ -147,8 +147,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); val = max_ratio; } - if (val > 0xFFFF) /* RFC 4340, 11.3 */ - val = 0xFFFF; + if (val > DCCPF_ACK_RATIO_MAX) + val = DCCPF_ACK_RATIO_MAX; if (val == dp->dccps_l_ack_ratio) return; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 77ce2f6b0319..b859722fba72 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -300,6 +300,95 @@ cloning_failed: return -ENOMEM; } +static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) +{ + switch (feat_num) { + case DCCPF_ACK_RATIO: + return val <= DCCPF_ACK_RATIO_MAX; + case DCCPF_SEQUENCE_WINDOW: + return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; + } + return 0; /* feature unknown - so we can't tell */ +} + +/* check that SP values are within the ranges defined in RFC 4340 */ +static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) +{ + switch (feat_num) { + case DCCPF_CCID: + return val == DCCPC_CCID2 || val == DCCPC_CCID3; + /* Type-check Boolean feature values: */ + case DCCPF_SHORT_SEQNOS: + case DCCPF_ECN_INCAPABLE: + case DCCPF_SEND_ACK_VECTOR: + case DCCPF_SEND_NDP_COUNT: + case DCCPF_DATA_CHECKSUM: + case DCCPF_SEND_LEV_RATE: + return val < 2; + case DCCPF_MIN_CSUM_COVER: + return val < 16; + } + return 0; /* feature unknown */ +} + +static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) +{ + if (sp_list == NULL || sp_len < 1) + return 0; + while (sp_len--) + if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) + return 0; + return 1; +} + +/** + * __feat_register_nn - Register new NN value on socket + * @fn: feature-negotiation list to register with + * @feat: an NN feature from %dccp_feature_numbers + * @mandatory: use Mandatory option if 1 + * @nn_val: value to register (restricted to 4 bytes) + * Note that NN features are local by definition (RFC 4340, 6.3.2). + */ +static int __feat_register_nn(struct list_head *fn, u8 feat, + u8 mandatory, u64 nn_val) +{ + dccp_feat_val fval = { .nn = nn_val }; + + if (dccp_feat_type(feat) != FEAT_NN || + !dccp_feat_is_valid_nn_val(feat, nn_val)) + return -EINVAL; + + /* Don't bother with default values, they will be activated anyway. */ + if (nn_val - (u64)dccp_feat_default_value(feat) == 0) + return 0; + + return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); +} + +/** + * __feat_register_sp - Register new SP value/list on socket + * @fn: feature-negotiation list to register with + * @feat: an SP feature from %dccp_feature_numbers + * @is_local: whether the local (1) or the remote (0) @feat is meant + * @mandatory: use Mandatory option if 1 + * @sp_val: SP value followed by optional preference list + * @sp_len: length of @sp_val in bytes + */ +static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, + u8 mandatory, u8 const *sp_val, u8 sp_len) +{ + dccp_feat_val fval; + + if (dccp_feat_type(feat) != FEAT_SP || + !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) + return -EINVAL; + + if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) + return -ENOMEM; + + return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); +} + int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, u8 *val, u8 len, gfp_t gfp) { @@ -836,42 +925,30 @@ out_clean: EXPORT_SYMBOL_GPL(dccp_feat_clone); -static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat, - u8 *val, u8 len) -{ - int rc = -ENOMEM; - u8 *copy = kmemdup(val, len, GFP_KERNEL); - - if (copy != NULL) { - rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL); - if (rc) - kfree(copy); - } - return rc; -} - -int dccp_feat_init(struct dccp_minisock *dmsk) +int dccp_feat_init(struct sock *sk) { + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); int rc; - INIT_LIST_HEAD(&dmsk->dccpms_pending); - INIT_LIST_HEAD(&dmsk->dccpms_conf); + INIT_LIST_HEAD(&dmsk->dccpms_pending); /* XXX no longer used */ + INIT_LIST_HEAD(&dmsk->dccpms_conf); /* XXX no longer used */ /* CCID L */ - rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID, - &dmsk->dccpms_tx_ccid, 1); + rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 1, 0, + &dmsk->dccpms_tx_ccid, 1); if (rc) goto out; /* CCID R */ - rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID, - &dmsk->dccpms_rx_ccid, 1); + rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 0, 0, + &dmsk->dccpms_rx_ccid, 1); if (rc) goto out; /* Ack ratio */ - rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO, - &dmsk->dccpms_ack_ratio, 1); + rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, + dmsk->dccpms_ack_ratio); out: return rc; } diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 7e953fd0a79b..9eefdb43783e 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -14,6 +14,15 @@ #include #include "dccp.h" +/* + * Known limit values + */ +/* Ack Ratio takes 2-byte integer values (11.3) */ +#define DCCPF_ACK_RATIO_MAX 0xFFFF +/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ +#define DCCPF_SEQ_WMIN 32 +#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull + enum dccp_feat_type { FEAT_AT_RX = 1, /* located at RX side of half-connection */ FEAT_AT_TX = 2, /* located at TX side of half-connection */ @@ -74,6 +83,20 @@ static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; } +/** + * struct ccid_dependency - Track changes resulting from choosing a CCID + * @dependent_feat: one of %dccp_feature_numbers + * @is_local: local (1) or remote (0) @dependent_feat + * @is_mandatory: whether presence of @dependent_feat is mission-critical or not + * @val: corresponding default value for @dependent_feat (u8 is sufficient here) + */ +struct ccid_dependency { + u8 dependent_feat; + bool is_local:1, + is_mandatory:1; + u8 val; +}; + #ifdef CONFIG_IP_DCCP_DEBUG extern const char *dccp_feat_typename(const u8 type); extern const char *dccp_feat_name(const u8 feat); @@ -96,6 +119,6 @@ extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, extern void dccp_feat_clean(struct dccp_minisock *dmsk); extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); -extern int dccp_feat_init(struct dccp_minisock *dmsk); +extern int dccp_feat_init(struct sock *sk); #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index dafcefd86594..01332fe7a99a 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -202,7 +202,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) * setsockopt(CCIDs-I-want/accept). -acme */ if (likely(ctl_sock_initialized)) { - int rc = dccp_feat_init(dmsk); + int rc = dccp_feat_init(sk); if (rc) return rc; From 71bb49596bbf4e5a3328e1704d18604e822ba181 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 014/105] dccp: Query supported CCIDs This provides a data structure to record which CCIDs are locally supported and three accessor functions: - a test function for internal use which is used to validate CCID requests made by the user; - a copy function so that the list can be used for feature-negotiation; - documented getsockopt() support so that the user can query capabilities. The data structure is a table which is filled in at compile-time with the list of available CCIDs (which in turn depends on the Kconfig choices). Using the copy function for cloning the list of supported CCIDs is useful for feature negotiation, since the negotiation is now with the full list of available CCIDs (e.g. {2, 3}) instead of the default value {2}. This means negotiation will not fail if the peer requests to use CCID3 instead of CCID2. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 4 +++ include/linux/dccp.h | 1 + net/dccp/ccid.c | 48 +++++++++++++++++++++++++++++++ net/dccp/ccid.h | 5 ++++ net/dccp/feat.c | 4 +++ net/dccp/proto.c | 2 ++ 6 files changed, 64 insertions(+) diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 39131a3c78f8..f0aeb20fa63b 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -57,6 +57,10 @@ can be set before calling bind(). DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet size (application payload size) in bytes, see RFC 4340, section 14. +DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs +supported by the endpoint (see include/linux/dccp.h for symbolic constants). +The caller needs to provide a sufficiently large (> 2) array of type uint8_t. + DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 484b8a1fb023..d3ac1bde60b4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -209,6 +209,7 @@ struct dccp_so_feat { #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 +#define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 4809753d12ae..f72ca83df552 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -13,6 +13,13 @@ #include "ccid.h" +static u8 builtin_ccids[] = { + DCCPC_CCID2, /* CCID2 is supported by default */ +#if defined(CONFIG_IP_DCCP_CCID3) || defined(CONFIG_IP_DCCP_CCID3_MODULE) + DCCPC_CCID3, +#endif +}; + static struct ccid_operations *ccids[CCID_MAX]; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) static atomic_t ccids_lockct = ATOMIC_INIT(0); @@ -86,6 +93,47 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab) } } +/* check that up to @array_len members in @ccid_array are supported */ +bool ccid_support_check(u8 const *ccid_array, u8 array_len) +{ + u8 i, j, found; + + for (i = 0, found = 0; i < array_len; i++, found = 0) { + for (j = 0; !found && j < ARRAY_SIZE(builtin_ccids); j++) + found = (ccid_array[i] == builtin_ccids[j]); + if (!found) + return false; + } + return true; +} + +/** + * ccid_get_builtin_ccids - Provide copy of `builtin' CCID array + * @ccid_array: pointer to copy into + * @array_len: value to return length into + * This function allocates memory - caller must see that it is freed after use. + */ +int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) +{ + *ccid_array = kmemdup(builtin_ccids, sizeof(builtin_ccids), gfp_any()); + if (*ccid_array == NULL) + return -ENOBUFS; + *array_len = ARRAY_SIZE(builtin_ccids); + return 0; +} + +int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + if (len < sizeof(builtin_ccids)) + return -EINVAL; + + if (put_user(sizeof(builtin_ccids), optlen) || + copy_to_user(optval, builtin_ccids, sizeof(builtin_ccids))) + return -EFAULT; + return 0; +} + int ccid_register(struct ccid_operations *ccid_ops) { int err = -ENOBUFS; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index fdeae7b57319..259f5469d7d0 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -103,6 +103,11 @@ static inline void *ccid_priv(const struct ccid *ccid) return (void *)ccid->ccid_priv; } +extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); +extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); +extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, + char __user *, int __user *); + extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index b859722fba72..9399554878cc 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -383,6 +383,10 @@ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) return -EINVAL; + /* Avoid negotiating alien CCIDs by only advertising supported ones */ + if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) + return -EOPNOTSUPP; + if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) return -ENOMEM; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 01332fe7a99a..b4b10cbd8880 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -649,6 +649,8 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_GET_CUR_MPS: val = dp->dccps_mss_cache; break; + case DCCP_SOCKOPT_AVAILABLE_CCIDS: + return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; From 093e1f46cf162913d05e1d4eeb01baa3e297b683 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 015/105] dccp: Resolve dependencies of features on choice of CCID This provides a missing link in the code chain, as several features implicitly depend and/or rely on the choice of CCID. Most notably, this is the Send Ack Vector feature, but also Ack Ratio and Send Loss Event Rate (also taken care of). For Send Ack Vector, the situation is as follows: * since CCID2 mandates the use of Ack Vectors, there is no point in allowing endpoints which use CCID2 to disable Ack Vector features such a connection; * a peer with a TX CCID of CCID2 will always expect Ack Vectors, and a peer with a RX CCID of CCID2 must always send Ack Vectors (RFC 4341, sec. 4); * for all other CCIDs, the use of (Send) Ack Vector is optional and thus negotiable. However, this implies that the code negotiating the use of Ack Vectors also supports it (i.e. is able to supply and to either parse or ignore received Ack Vectors). Since this is not the case (CCID-3 has no Ack Vector support), the use of Ack Vectors is here disabled, with a comment in the source code. An analogous consideration arises for the Send Loss Event Rate feature, since the CCID-3 implementation does not support the loss interval options of RFC 4342. To make such use explicit, corresponding feature-negotiation options are inserted which signal the use of the loss event rate option, as it is used by the CCID3 code. Lastly, the values of the Ack Ratio feature are matched to the choice of CCID. The patch implements this as a function which is called after the user has made all other registrations for changing default values of features. The table is variable-length, the reserved (and hence for feature-negotiation invalid, confirmed by considering section 19.4 of RFC 4340) feature number `0' is used to mark the end of the table. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/dccp.h | 1 + net/dccp/feat.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++ net/dccp/output.c | 4 ++ net/dccp/proto.c | 3 + 4 files changed, 168 insertions(+) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index dee4a90886d6..1881527bdcd7 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -442,6 +442,7 @@ static inline int dccp_ack_pending(const struct sock *sk) inet_csk_ack_scheduled(sk); } +extern int dccp_feat_finalise_settings(struct dccp_sock *dp); extern void dccp_feat_list_purge(struct list_head *fn_list); extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 9399554878cc..ed9f50b1c34a 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -441,6 +441,166 @@ int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, EXPORT_SYMBOL_GPL(dccp_feat_change); +/* + * Tracking features whose value depend on the choice of CCID + * + * This is designed with an extension in mind so that a list walk could be done + * before activating any features. However, the existing framework was found to + * work satisfactorily up until now, the automatic verification is left open. + * When adding new CCIDs, add a corresponding dependency table here. + */ +static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) +{ + static const struct ccid_dependency ccid2_dependencies[2][2] = { + /* + * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX + * feature and Send Ack Vector is an RX feature, `is_local' + * needs to be reversed. + */ + { /* Dependencies of the receiver-side (remote) CCID2 */ + { + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = true, + .is_mandatory = true, + .val = 1 + }, + { 0, 0, 0, 0 } + }, + { /* Dependencies of the sender-side (local) CCID2 */ + { + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = false, + .is_mandatory = true, + .val = 1 + }, + { 0, 0, 0, 0 } + } + }; + static const struct ccid_dependency ccid3_dependencies[2][5] = { + { /* + * Dependencies of the receiver-side CCID3 + */ + { /* locally disable Ack Vectors */ + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = true, + .is_mandatory = false, + .val = 0 + }, + { /* see below why Send Loss Event Rate is on */ + .dependent_feat = DCCPF_SEND_LEV_RATE, + .is_local = true, + .is_mandatory = true, + .val = 1 + }, + { /* NDP Count is needed as per RFC 4342, 6.1.1 */ + .dependent_feat = DCCPF_SEND_NDP_COUNT, + .is_local = false, + .is_mandatory = true, + .val = 1 + }, + { 0, 0, 0, 0 }, + }, + { /* + * CCID3 at the TX side: we request that the HC-receiver + * will not send Ack Vectors (they will be ignored, so + * Mandatory is not set); we enable Send Loss Event Rate + * (Mandatory since the implementation does not support + * the Loss Intervals option of RFC 4342, 8.6). + * The last two options are for peer's information only. + */ + { + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = false, + .is_mandatory = false, + .val = 0 + }, + { + .dependent_feat = DCCPF_SEND_LEV_RATE, + .is_local = false, + .is_mandatory = true, + .val = 1 + }, + { /* this CCID does not support Ack Ratio */ + .dependent_feat = DCCPF_ACK_RATIO, + .is_local = true, + .is_mandatory = false, + .val = 0 + }, + { /* tell receiver we are sending NDP counts */ + .dependent_feat = DCCPF_SEND_NDP_COUNT, + .is_local = true, + .is_mandatory = false, + .val = 1 + }, + { 0, 0, 0, 0 } + } + }; + switch (ccid) { + case DCCPC_CCID2: + return ccid2_dependencies[is_local]; + case DCCPC_CCID3: + return ccid3_dependencies[is_local]; + default: + return NULL; + } +} + +/** + * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID + * @fn: feature-negotiation list to update + * @id: CCID number to track + * @is_local: whether TX CCID (1) or RX CCID (0) is meant + * This function needs to be called after registering all other features. + */ +static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) +{ + const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); + int i, rc = (table == NULL); + + for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) + if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) + rc = __feat_register_sp(fn, table[i].dependent_feat, + table[i].is_local, + table[i].is_mandatory, + &table[i].val, 1); + else + rc = __feat_register_nn(fn, table[i].dependent_feat, + table[i].is_mandatory, + table[i].val); + return rc; +} + +/** + * dccp_feat_finalise_settings - Finalise settings before starting negotiation + * @dp: client or listening socket (settings will be inherited) + * This is called after all registrations (socket initialisation, sysctls, and + * sockopt calls), and before sending the first packet containing Change options + * (ie. client-Request or server-Response), to ensure internal consistency. + */ +int dccp_feat_finalise_settings(struct dccp_sock *dp) +{ + struct list_head *fn = &dp->dccps_featneg; + struct dccp_feat_entry *entry; + int i = 2, ccids[2] = { -1, -1 }; + + /* + * Propagating CCIDs: + * 1) not useful to propagate CCID settings if this host advertises more + * than one CCID: the choice of CCID may still change - if this is + * the client, or if this is the server and the client sends + * singleton CCID values. + * 2) since is that propagate_ccid changes the list, we defer changing + * the sorted list until after the traversal. + */ + list_for_each_entry(entry, fn, node) + if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) + ccids[entry->is_local] = entry->val.sp.vec[0]; + while (i--) + if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) + return -1; + return 0; +} + static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) { struct dccp_sock *dp = dccp_sk(sk); diff --git a/net/dccp/output.c b/net/dccp/output.c index d06945c7d3df..dc96ecfbe6e8 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -469,6 +469,10 @@ int dccp_connect(struct sock *sk) struct sk_buff *skb; struct inet_connection_sock *icsk = inet_csk(sk); + /* do not connect if feature negotiation setup fails */ + if (dccp_feat_finalise_settings(dccp_sk(sk))) + return -EPROTO; + dccp_connect_init(sk); skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b4b10cbd8880..46cb3490d48e 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -278,6 +278,9 @@ static inline int dccp_listen_start(struct sock *sk, int backlog) struct dccp_sock *dp = dccp_sk(sk); dp->dccps_role = DCCP_ROLE_LISTEN; + /* do not start to listen if feature negotiation setup fails */ + if (dccp_feat_finalise_settings(dp)) + return -EPROTO; return inet_csk_listen_start(sk, backlog); } From d4c8741c431e07cfc66eb2b4c3a17b8d4975d9c0 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 016/105] dccp: Mechanism to resolve CCID dependencies This adds a hook to resolve features whose value depends on the choice of CCID. It is done at the server since it can only be done after the CCID values have been negotiated; i.e. the client will add its CCID preference list on the Change options sent in the Request, which will be reconciled with the local preference list of the server. The concept is documented on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/feature_negotiation/\ implementation_notes.html#ccid_dependencies Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/dccp.h | 1 + net/dccp/feat.c | 25 +++++++++++++++++++++++++ net/dccp/output.c | 13 +++++++++---- 3 files changed, 35 insertions(+), 4 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 1881527bdcd7..e656dafb5d96 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -443,6 +443,7 @@ static inline int dccp_ack_pending(const struct sock *sk) } extern int dccp_feat_finalise_settings(struct dccp_sock *dp); +extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); extern void dccp_feat_list_purge(struct list_head *fn_list); extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index ed9f50b1c34a..6852960bb0a9 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -601,6 +601,31 @@ int dccp_feat_finalise_settings(struct dccp_sock *dp) return 0; } +/** + * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features + * It is the server which resolves the dependencies once the CCID has been + * fully negotiated. If no CCID has been negotiated, it uses the default CCID. + */ +int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) +{ + struct list_head *fn = &dreq->dreq_featneg; + struct dccp_feat_entry *entry; + u8 is_local, ccid; + + for (is_local = 0; is_local <= 1; is_local++) { + entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); + + if (entry != NULL && !entry->empty_confirm) + ccid = entry->val.sp.vec[0]; + else + ccid = dccp_feat_default_value(DCCPF_CCID); + + if (dccp_feat_propagate_ccid(fn, ccid, is_local)) + return -1; + } + return 0; +} + static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) { struct dccp_sock *dp = dccp_sk(sk); diff --git a/net/dccp/output.c b/net/dccp/output.c index dc96ecfbe6e8..19a93d5433a7 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -339,10 +339,12 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; - if (dccp_insert_options_rsk(dreq, skb)) { - kfree_skb(skb); - return NULL; - } + /* Resolve feature dependencies resulting from choice of CCID */ + if (dccp_feat_server_ccid_dependencies(dreq)) + goto response_failed; + + if (dccp_insert_options_rsk(dreq, skb)) + goto response_failed; /* Build and checksum header */ dh = dccp_zeroed_hdr(skb, dccp_header_size); @@ -363,6 +365,9 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, inet_rsk(req)->acked = 1; DCCP_INC_STATS(DCCP_MIB_OUTSEGS); return skb; +response_failed: + kfree_skb(skb); + return NULL; } EXPORT_SYMBOL_GPL(dccp_make_response); From 668144f7b41716a9efe1b398e15ead32a26cd101 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 017/105] dccp: Deprecate old setsockopt framework The previous setsockopt interface, which passed socket options via struct dccp_so_feat, is complicated/difficult to use. Continuing to support it leads to ugly code since the old approach did not distinguish between NN and SP values. This patch removes the old setsockopt interface and replaces it with two new functions to register NN/SP values for feature negotiation. These are essentially wrappers around the internal __feat_register functions, with checking added to avoid * wrong usage (type); * changing values while the connection is in progress. Signed-off-by: Gerrit Renker --- include/linux/dccp.h | 7 ----- net/dccp/feat.c | 72 +++++++++++++++++--------------------------- net/dccp/feat.h | 5 +-- net/dccp/proto.c | 53 ++------------------------------ 4 files changed, 32 insertions(+), 105 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index d3ac1bde60b4..6eaaca9b037a 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -193,13 +193,6 @@ enum dccp_feature_numbers { DCCPF_MAX_CCID_SPECIFIC = 255, }; -/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ -struct dccp_so_feat { - __u8 dccpsf_feat; - __u8 __user *dccpsf_val; - __u8 dccpsf_len; -}; - /* DCCP socket options */ #define DCCP_SOCKOPT_PACKET_SIZE 1 /* XXX deprecated, without effect */ #define DCCP_SOCKOPT_SERVICE 2 diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 6852960bb0a9..44b10afd3fb6 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -393,53 +393,35 @@ static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); } -int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, - u8 *val, u8 len, gfp_t gfp) -{ - struct dccp_opt_pend *opt; - - dccp_feat_debug(type, feature, *val); - - if (len > 3) { - DCCP_WARN("invalid length %d\n", len); +/** + * dccp_feat_register_sp - Register requests to change SP feature values + * @sk: client or listening socket + * @feat: one of %dccp_feature_numbers + * @is_local: whether the local (1) or remote (0) @feat is meant + * @list: array of preferred values, in descending order of preference + * @len: length of @list in bytes + */ +int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, + u8 const *list, u8 len) +{ /* any changes must be registered before establishing the connection */ + if (sk->sk_state != DCCP_CLOSED) + return -EISCONN; + if (dccp_feat_type(feat) != FEAT_SP) return -EINVAL; - } - /* XXX add further sanity checks */ - - /* check if that feature is already being negotiated */ - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { - /* ok we found a negotiation for this option already */ - if (opt->dccpop_feat == feature && opt->dccpop_type == type) { - dccp_pr_debug("Replacing old\n"); - /* replace */ - BUG_ON(opt->dccpop_val == NULL); - kfree(opt->dccpop_val); - opt->dccpop_val = val; - opt->dccpop_len = len; - opt->dccpop_conf = 0; - return 0; - } - } - - /* negotiation for a new feature */ - opt = kmalloc(sizeof(*opt), gfp); - if (opt == NULL) - return -ENOMEM; - - opt->dccpop_type = type; - opt->dccpop_feat = feature; - opt->dccpop_len = len; - opt->dccpop_val = val; - opt->dccpop_conf = 0; - opt->dccpop_sc = NULL; - - BUG_ON(opt->dccpop_val == NULL); - - list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending); - return 0; + return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, + 0, list, len); } -EXPORT_SYMBOL_GPL(dccp_feat_change); +/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ +int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) +{ + /* any changes must be registered before establishing the connection */ + if (sk->sk_state != DCCP_CLOSED) + return -EISCONN; + if (dccp_feat_type(feat) != FEAT_NN) + return -EINVAL; + return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); +} /* * Tracking features whose value depend on the choice of CCID @@ -1137,7 +1119,7 @@ int dccp_feat_init(struct sock *sk) /* Ack ratio */ rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, - dmsk->dccpms_ack_ratio); + dp->dccps_l_ack_ratio); out: return rc; } diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 9eefdb43783e..2c92bd18e5f4 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -110,8 +110,9 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) #define dccp_feat_debug(type, feat, val) #endif /* CONFIG_IP_DCCP_DEBUG */ -extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, - u8 *val, u8 len, gfp_t gfp); +extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, + u8 const *list, u8 len); +extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len); extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 46cb3490d48e..108d56bd25c5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -470,44 +470,6 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, return 0; } -/* byte 1 is feature. the rest is the preference list */ -static int dccp_setsockopt_change(struct sock *sk, int type, - struct dccp_so_feat __user *optval) -{ - struct dccp_so_feat opt; - u8 *val; - int rc; - - if (copy_from_user(&opt, optval, sizeof(opt))) - return -EFAULT; - /* - * rfc4340: 6.1. Change Options - */ - if (opt.dccpsf_len < 1) - return -EINVAL; - - val = kmalloc(opt.dccpsf_len, GFP_KERNEL); - if (!val) - return -ENOMEM; - - if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) { - rc = -EFAULT; - goto out_free_val; - } - - rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat, - val, opt.dccpsf_len, GFP_KERNEL); - if (rc) - goto out_free_val; - -out: - return rc; - -out_free_val: - kfree(val); - goto out; -} - static int do_dccp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { @@ -530,20 +492,9 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, err = 0; break; case DCCP_SOCKOPT_CHANGE_L: - if (optlen != sizeof(struct dccp_so_feat)) - err = -EINVAL; - else - err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L, - (struct dccp_so_feat __user *) - optval); - break; case DCCP_SOCKOPT_CHANGE_R: - if (optlen != sizeof(struct dccp_so_feat)) - err = -EINVAL; - else - err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R, - (struct dccp_so_feat __user *) - optval); + DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); + err = 0; break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) From 20f41eee82864e308a5499308a1722dc3181cc3a Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 018/105] dccp: Feature negotiation for minimum-checksum-coverage This provides feature negotiation for server minimum checksum coverage which so far has been missing. Since sender/receiver coverage values range only from 0...15, their type has also been reduced in size from u16 to u4. Feature-negotiation options are now generated for both sender and receiver coverage, i.e. when the peer has `forgotten' to enable partial coverage then feature negotiation will automatically enable (negotiate) the partial coverage value for this connection. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- include/linux/dccp.h | 4 ++-- net/dccp/proto.c | 53 +++++++++++++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6eaaca9b037a..5a5a89935dbc 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -527,8 +527,8 @@ struct dccp_sock { __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; - __u16 dccps_pcslen; - __u16 dccps_pcrlen; + __u8 dccps_pcslen:4; + __u8 dccps_pcrlen:4; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 108d56bd25c5..47b137a3feaf 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -470,6 +470,42 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, return 0; } +static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) +{ + u8 *list, len; + int i, rc; + + if (cscov < 0 || cscov > 15) + return -EINVAL; + /* + * Populate a list of permissible values, in the range cscov...15. This + * is necessary since feature negotiation of single values only works if + * both sides incidentally choose the same value. Since the list starts + * lowest-value first, negotiation will pick the smallest shared value. + */ + if (cscov == 0) + return 0; + len = 16 - cscov; + + list = kmalloc(len, GFP_KERNEL); + if (list == NULL) + return -ENOBUFS; + + for (i = 0; i < len; i++) + list[i] = cscov++; + + rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); + + if (rc == 0) { + if (rx) + dccp_sk(sk)->dccps_pcrlen = cscov; + else + dccp_sk(sk)->dccps_pcslen = cscov; + } + kfree(list); + return rc; +} + static int do_dccp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { @@ -502,20 +538,11 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, else dp->dccps_server_timewait = (val != 0); break; - case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ - if (val < 0 || val > 15) - err = -EINVAL; - else - dp->dccps_pcslen = val; + case DCCP_SOCKOPT_SEND_CSCOV: + err = dccp_setsockopt_cscov(sk, val, false); break; - case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */ - if (val < 0 || val > 15) - err = -EINVAL; - else { - dp->dccps_pcrlen = val; - /* FIXME: add feature negotiation, - * ChangeL(MinimumChecksumCoverage, val) */ - } + case DCCP_SOCKOPT_RECV_CSCOV: + err = dccp_setsockopt_cscov(sk, val, true); break; default: err = -ENOPROTOOPT; From 17c30b40ed79e9f3955e884632c8f01e577b204a Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 019/105] dccp: Deprecate Ack Ratio sysctl This patch deprecates the Ack Ratio sysctl, since * Ack Ratio is entirely ignored by CCID-3 and CCID-4, * Ack Ratio currently doesn't work in CCID-2 (i.e. is always set to 1); * even if it would work in CCID-2, there is no point for a user to change it: - Ack Ratio is constrained by cwnd (RFC 4341, 6.1.2), - if Ack Ratio > cwnd, the system resorts to spurious RTO timeouts (since waiting for Acks which will never arrive in this window), - cwnd is not a user-configurable value. The only reasonable place for Ack Ratio is to print it for debugging. It is planned to do this later on, as part of e.g. dccp_probe. With this patch Ack Ratio is now under full control of feature negotiation: * Ack Ratio is resolved as a dependency of the selected CCID; * if the chosen CCID supports it (i.e. CCID == CCID-2), Ack Ratio is set to the default of 2, following RFC 4340, 11.3 - "New connections start with Ack Ratio 2 for both endpoints"; * what happens then is part of another patch set, since it concerns the dynamic update of Ack Ratio while the connection is in full flight. Thanks to Tomasz Grobelny for discussion leading up to this patch. Signed-off-by: Gerrit Renker Acked-by: Arnaldo Carvalho de Melo --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 2 -- net/dccp/dccp.h | 1 - net/dccp/minisocks.c | 1 - net/dccp/options.c | 1 - net/dccp/sysctl.c | 7 ------- 6 files changed, 15 deletions(-) diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index f0aeb20fa63b..43df4487379b 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -125,9 +125,6 @@ send_ndp = 1 send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). -ack_ratio = 2 - The default Ack Ratio (sec. 11.3) to use. - tx_ccid = 2 Default CCID for the sender-receiver half-connection. diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 5a5a89935dbc..eda389ce04f4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -368,7 +368,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * @dccpms_ccid - Congestion Control Id (CCID) (section 10) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) - * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ @@ -378,7 +377,6 @@ struct dccp_minisock { __u8 dccpms_tx_ccid; __u8 dccpms_send_ack_vector; __u8 dccpms_send_ndp_count; - __u8 dccpms_ack_ratio; struct list_head dccpms_pending; struct list_head dccpms_conf; }; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e656dafb5d96..031ce350d3c1 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -98,7 +98,6 @@ extern int sysctl_dccp_retries2; extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; -extern int sysctl_dccp_feat_ack_ratio; extern int sysctl_dccp_feat_send_ack_vector; extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index e487133ae079..ee7f40f8ddfa 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -47,7 +47,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk) dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; - dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } diff --git a/net/dccp/options.c b/net/dccp/options.c index 67a171a1268c..515ad45013ad 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -26,7 +26,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 21295993fdb8..f6e54f433e29 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "ack_ratio", - .data = &sysctl_dccp_feat_ack_ratio, - .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "send_ackvec", .data = &sysctl_dccp_feat_send_ack_vector, From 73bbe095bbb9ce5f94d5475bad54c7ccd8573b1b Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 020/105] dccp: Tidy up setsockopt calls This splits the setsockopt calls into two groups, depending on whether an integer argument (val) is required and whether routines being called do their own locking. Some options (such as setting the CCID) use u8 rather than int, so that for these the test with regard to integer-sizeof can not be used. The second switch-case statement now only has those statements which need locking and which make use of `val'. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Acked-by: Arnaldo Carvalho de Melo Reviewed-by: Eugene Teo --- net/dccp/proto.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 47b137a3feaf..e29bbf914057 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -512,7 +512,17 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; - if (optlen < sizeof(int)) + switch (optname) { + case DCCP_SOCKOPT_PACKET_SIZE: + DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); + return 0; + case DCCP_SOCKOPT_CHANGE_L: + case DCCP_SOCKOPT_CHANGE_R: + DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); + return 0; + } + + if (optlen < (int)sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) @@ -523,15 +533,6 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, lock_sock(sk); switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - err = 0; - break; - case DCCP_SOCKOPT_CHANGE_L: - case DCCP_SOCKOPT_CHANGE_R: - DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); - err = 0; - break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) err = -EOPNOTSUPP; @@ -548,8 +549,8 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, err = -ENOPROTOOPT; break; } - release_sock(sk); + return err; } From fade756f18d42694e3acb00e3471ab43002cba16 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 021/105] dccp: Set per-connection CCIDs via socket options With this patch, TX/RX CCIDs can now be changed on a per-connection basis, which overrides the defaults set by the global sysctl variables for TX/RX CCIDs. To make full use of this facility, the remaining patches of this patch set are needed, which track dependencies and activate negotiated feature values. Note on the maximum number of CCIDs that can be registered: ----------------------------------------------------------- The maximum number of CCIDs that can be registered on the socket is constrained by the space in a Confirm/Change feature negotiation option. The space in these in turn depends on the size of header options as defined in RFC 4340, 5.8. Since this is a recurring constant, it has been moved from ackvec.h into linux/dccp.h, clarifying its purpose. Relative to this size, the maximum number of CCID identifiers that can be present in a Confirm option (which always consumes 1 byte more than a Change option, cf. 6.1) is 2 bytes less than the maximum TLV size: one for the CCID-feature-type and one for the selected value. Signed-off-by: Gerrit Renker --- Documentation/networking/dccp.txt | 14 +++++++++++++ include/linux/dccp.h | 5 +++++ net/dccp/ackvec.c | 9 ++++---- net/dccp/ackvec.h | 5 ++--- net/dccp/feat.h | 2 ++ net/dccp/proto.c | 34 +++++++++++++++++++++++++++++++ 6 files changed, 61 insertions(+), 8 deletions(-) diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 43df4487379b..610083ff73f6 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -61,6 +61,20 @@ DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs supported by the endpoint (see include/linux/dccp.h for symbolic constants). The caller needs to provide a sufficiently large (> 2) array of type uint8_t. +DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same +time, combining the operation of the next two socket options. This option is +preferrable over the latter two, since often applications will use the same +type of CCID for both directions; and mixed use of CCIDs is not currently well +understood. This socket option takes as argument at least one uint8_t value, or +an array of uint8_t values, which must match available CCIDS (see above). CCIDs +must be registered on the socket before calling connect() or listen(). + +DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets +the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. +Please note that the getsockopt argument type here is `int', not uint8_t. + +DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. + DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait diff --git a/include/linux/dccp.h b/include/linux/dccp.h index eda389ce04f4..6a72ff52a8a4 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -168,6 +168,8 @@ enum { DCCPO_MIN_CCID_SPECIFIC = 128, DCCPO_MAX_CCID_SPECIFIC = 255, }; +/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ +#define DCCP_SINGLE_OPT_MAXLEN 253 /* DCCP CCIDS */ enum { @@ -203,6 +205,9 @@ enum dccp_feature_numbers { #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 #define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 +#define DCCP_SOCKOPT_CCID 13 +#define DCCP_SOCKOPT_TX_CCID 14 +#define DCCP_SOCKOPT_RX_CCID 15 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 1e8be246ad15..01e4d39fa232 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -12,7 +12,6 @@ #include "ackvec.h" #include "dccp.h" -#include #include #include #include @@ -68,7 +67,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; /* Figure out how many options do we need to represent the ackvec */ - const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); + const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN); u16 len = av->av_vec_len + 2 * nr_opts, i; u32 elapsed_time; const unsigned char *tail, *from; @@ -100,8 +99,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) for (i = 0; i < nr_opts; ++i) { int copylen = len; - if (len > DCCP_MAX_ACKVEC_OPT_LEN) - copylen = DCCP_MAX_ACKVEC_OPT_LEN; + if (len > DCCP_SINGLE_OPT_MAXLEN) + copylen = DCCP_SINGLE_OPT_MAXLEN; *to++ = DCCPO_ACK_VECTOR_0; *to++ = copylen + 2; @@ -432,7 +431,7 @@ found: int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, u64 *ackno, const u8 opt, const u8 *value, const u8 len) { - if (len > DCCP_MAX_ACKVEC_OPT_LEN) + if (len > DCCP_SINGLE_OPT_MAXLEN) return -1; /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index bcb64fb4acef..4ccee030524e 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -11,15 +11,14 @@ * published by the Free Software Foundation. */ +#include #include #include #include #include -/* Read about the ECN nonce to see why it is 253 */ -#define DCCP_MAX_ACKVEC_OPT_LEN 253 /* We can spread an ack vector across multiple options */ -#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) +#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2) #define DCCP_ACKVEC_STATE_RECEIVED 0 #define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 2c92bd18e5f4..b53b11717c40 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -22,6 +22,8 @@ /* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ #define DCCPF_SEQ_WMIN 32 #define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull +/* Maximum number of SP values that fit in a single (Confirm) option */ +#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) enum dccp_feat_type { FEAT_AT_RX = 1, /* located at RX side of half-connection */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index e29bbf914057..2cd56df44d8e 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -506,6 +506,36 @@ static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) return rc; } +static int dccp_setsockopt_ccid(struct sock *sk, int type, + char __user *optval, int optlen) +{ + u8 *val; + int rc = 0; + + if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) + return -EINVAL; + + val = kmalloc(optlen, GFP_KERNEL); + if (val == NULL) + return -ENOMEM; + + if (copy_from_user(val, optval, optlen)) { + kfree(val); + return -EFAULT; + } + + lock_sock(sk); + if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); + + if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); + release_sock(sk); + + kfree(val); + return rc; +} + static int do_dccp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { @@ -520,6 +550,10 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_CHANGE_R: DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); return 0; + case DCCP_SOCKOPT_CCID: + case DCCP_SOCKOPT_RX_CCID: + case DCCP_SOCKOPT_TX_CCID: + return dccp_setsockopt_ccid(sk, optname, optval, optlen); } if (optlen < (int)sizeof(int)) From c8041e264b3db6944d37b87969fbe6458cb30cfd Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 022/105] dccp: API to query the current TX/RX CCID This provides function to query the current TX/RX CCID dynamically, without reliance on the minisock value, using dynamic information available in the currently loaded CCID module. This query function is then used to (a) provide the getsockopt part for getting/setting CCIDs via sockopts; (b) replace the current test for "which CCID is in use" in probe.c. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/ccid.h | 18 ++++++++++++++++++ net/dccp/probe.c | 7 ++----- net/dccp/proto.c | 10 ++++++++++ 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 259f5469d7d0..803343aed004 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -116,6 +116,24 @@ extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, gfp_t gfp); +static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) +{ + struct ccid *ccid = dp->dccps_hc_rx_ccid; + + if (ccid == NULL || ccid->ccid_ops == NULL) + return -1; + return ccid->ccid_ops->ccid_id; +} + +static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) +{ + struct ccid *ccid = dp->dccps_hc_tx_ccid; + + if (ccid == NULL || ccid->ccid_ops == NULL) + return -1; + return ccid->ccid_ops->ccid_id; +} + extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); diff --git a/net/dccp/probe.c b/net/dccp/probe.c index 81368a7f5379..9ca783d74678 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -74,14 +74,11 @@ static void printl(const char *fmt, ...) static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size) { - const struct dccp_minisock *dmsk = dccp_msk(sk); const struct inet_sock *inet = inet_sk(sk); - const struct ccid3_hc_tx_sock *hctx; + struct ccid3_hc_tx_sock *hctx = NULL; - if (dmsk->dccpms_tx_ccid == DCCPC_CCID3) + if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) hctx = ccid3_hc_tx_sk(sk); - else - hctx = NULL; if (port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2cd56df44d8e..6550452d59e2 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -667,6 +667,16 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, break; case DCCP_SOCKOPT_AVAILABLE_CCIDS: return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); + case DCCP_SOCKOPT_TX_CCID: + val = ccid_get_current_tx_ccid(dp); + if (val < 0) + return -ENOPROTOOPT; + break; + case DCCP_SOCKOPT_RX_CCID: + val = ccid_get_current_rx_ccid(dp); + if (val < 0) + return -ENOPROTOOPT; + break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; From b9aaac1c538a9c86e8ef3be2579a13ff55580908 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 023/105] dccp: Increase the scope of variable-length htonl/ntohl functions This extends the scope of two available functions, encode|decode_value_var, to work up to 6 (8) bytes, to match maximum requirements in the RFC. These functions are going to be used both by general option processing and feature negotiation code, hence declarations have been put into feat.h. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Acked-by: Arnaldo Carvalho de Melo --- net/dccp/feat.h | 14 ++++++++++++++ net/dccp/options.c | 21 ++++++++++++++------- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/net/dccp/feat.h b/net/dccp/feat.h index b53b11717c40..90d16caf5457 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -124,4 +124,18 @@ extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); extern int dccp_feat_init(struct sock *sk); +/* + * Encoding variable-length options and their maximum length. + * + * This affects NN options (SP options are all u8) and other variable-length + * options (see table 3 in RFC 4340). The limit is currently given the Sequence + * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other + * options consume less than 6 bytes (timestamps are 4 bytes). + * When updating this constant (e.g. due to new internet drafts / RFCs), make + * sure that you also update all code which refers to it. + */ +#define DCCP_OPTVAL_MAXLEN 6 + +extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); +extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/options.c b/net/dccp/options.c index 515ad45013ad..9cb0ff894052 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -29,16 +29,20 @@ int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; -static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) +u64 dccp_decode_value_var(const u8 *bf, const u8 len) { - u32 value = 0; + u64 value = 0; + if (len >= DCCP_OPTVAL_MAXLEN) + value += ((u64)*bf++) << 40; + if (len > 4) + value += ((u64)*bf++) << 32; if (len > 3) - value += *bf++ << 24; + value += ((u64)*bf++) << 24; if (len > 2) - value += *bf++ << 16; + value += ((u64)*bf++) << 16; if (len > 1) - value += *bf++ << 8; + value += ((u64)*bf++) << 8; if (len > 0) value += *bf; @@ -298,9 +302,12 @@ out_invalid_option: EXPORT_SYMBOL_GPL(dccp_parse_options); -static void dccp_encode_value_var(const u32 value, unsigned char *to, - const unsigned int len) +void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) { + if (len >= DCCP_OPTVAL_MAXLEN) + *to++ = (value & 0xFF0000000000ull) >> 40; + if (len > 4) + *to++ = (value & 0xFF00000000ull) >> 32; if (len > 3) *to++ = (value & 0xFF000000) >> 24; if (len > 2) From d0440ee6f6903fcde6ed4efb88c910de1dfa18e5 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 024/105] dccp: Support for Mandatory options Support for Mandatory options is provided by this patch, which will be used by subsequent feature-negotiation patches. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Acked-by: Arnaldo Carvalho de Melo --- net/dccp/feat.h | 2 ++ net/dccp/options.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 90d16caf5457..602e0a7294ba 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -138,4 +138,6 @@ extern int dccp_feat_init(struct sock *sk); extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); + +extern int dccp_insert_option_mandatory(struct sk_buff *skb); #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/options.c b/net/dccp/options.c index 9cb0ff894052..676d53065de9 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -467,6 +467,21 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, return 0; } +/** + * dccp_insert_option_mandatory - Mandatory option (5.8.2) + * Note that since we are using skb_push, this function needs to be called + * _after_ inserting the option it is supposed to influence (stack order). + */ +int dccp_insert_option_mandatory(struct sk_buff *skb) +{ + if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) + return -1; + + DCCP_SKB_CB(skb)->dccpd_opt_len++; + *skb_push(skb, 1) = DCCPO_MANDATORY; + return 0; +} + static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len) { From cf9ddf73b9ba21a5cd6d3fcb0a45cfa9ec452033 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 025/105] dccp: Header option insertion routine for feature-negotiation The patch extends existing code: * Confirm options divide into the confirmed value plus an optional preference list for SP values. Previously only the preference list was echoed for SP values, now the confirmed value is added as per RFC 4340, 6.1; * length and sanity checks are added to avoid illegal memory (or NULL) access. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/feat.h | 2 + net/dccp/options.c | 91 ++++++++++++++++------------------------------ 2 files changed, 33 insertions(+), 60 deletions(-) diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 602e0a7294ba..8e863839ed91 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -140,4 +140,6 @@ extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); extern int dccp_insert_option_mandatory(struct sk_buff *skb); +extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, + u8 *val, u8 len, bool repeat_first); #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/options.c b/net/dccp/options.c index 676d53065de9..bfa1cb8f3ef1 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -482,23 +482,46 @@ int dccp_insert_option_mandatory(struct sk_buff *skb) return 0; } -static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len) +/** + * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb + * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R + * @feat: one out of %dccp_feature_numbers + * @val: NN value or SP array (preferred element first) to copy + * @len: true length of @val in bytes (excluding first element repetition) + * @repeat_first: whether to copy the first element of @val twice + * The last argument is used to construct Confirm options, where the preferred + * value and the preference list appear separately (RFC 4340, 6.3.1). Preference + * lists are kept such that the preferred entry is always first, so we only need + * to copy twice, and avoid the overhead of cloning into a bigger array. + */ +int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, + u8 *val, u8 len, bool repeat_first) { - u8 *to; + u8 tot_len, *to; - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) { - DCCP_WARN("packet too small for feature %d option!\n", feat); + /* take the `Feature' field and possible repetition into account */ + if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { + DCCP_WARN("length %u for feature %u too large\n", len, feat); return -1; } - DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3; + if (unlikely(val == NULL || len == 0)) + len = repeat_first = 0; + tot_len = 3 + repeat_first + len; - to = skb_push(skb, len + 3); + if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { + DCCP_WARN("packet too small for feature %d option!\n", feat); + return -1; + } + DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; + + to = skb_push(skb, tot_len); *to++ = type; - *to++ = len + 3; + *to++ = tot_len; *to++ = feat; + if (repeat_first) + *to++ = *val; if (len) memcpy(to, val, len); @@ -508,51 +531,6 @@ static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, return 0; } -static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_minisock *dmsk = dccp_msk(sk); - struct dccp_opt_pend *opt, *next; - int change = 0; - - /* confirm any options [NN opts] */ - list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { - dccp_insert_feat_opt(skb, opt->dccpop_type, - opt->dccpop_feat, opt->dccpop_val, - opt->dccpop_len); - /* fear empty confirms */ - if (opt->dccpop_val) - kfree(opt->dccpop_val); - kfree(opt); - } - INIT_LIST_HEAD(&dmsk->dccpms_conf); - - /* see which features we need to send */ - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { - /* see if we need to send any confirm */ - if (opt->dccpop_sc) { - dccp_insert_feat_opt(skb, opt->dccpop_type + 1, - opt->dccpop_feat, - opt->dccpop_sc->dccpoc_val, - opt->dccpop_sc->dccpoc_len); - - BUG_ON(!opt->dccpop_sc->dccpoc_val); - kfree(opt->dccpop_sc->dccpoc_val); - kfree(opt->dccpop_sc); - opt->dccpop_sc = NULL; - } - - /* any option not confirmed, re-send it */ - if (!opt->dccpop_conf) { - dccp_insert_feat_opt(skb, opt->dccpop_type, - opt->dccpop_feat, opt->dccpop_val, - opt->dccpop_len); - change++; - } - } - - return 0; -} - /* The length of all options needs to be a multiple of 4 (5.8) */ static void dccp_insert_option_padding(struct sk_buff *skb) { @@ -589,13 +567,6 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) dp->dccps_hc_rx_insert_options = 0; } - /* Feature negotiation */ - /* Data packets can't do feat negotiation */ - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA && - DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK && - dccp_insert_options_feat(sk, skb)) - return -1; - /* * Obtain RTT sample from Request/Response exchange. * This is currently used in CCID 3 initialisation. From 0ef118a017919cd661cf294811d1889ac556ee80 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 026/105] dccp: Insert feature-negotiation options into skb This patch replaces the earlier insertion routine from options.c, so that code specific to feature negotiation can remain in feat.c. This is possible by calling a function already existing in options.c. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/dccp.h | 2 ++ net/dccp/feat.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 031ce350d3c1..2e2a6f229eaf 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -443,6 +443,8 @@ static inline int dccp_ack_pending(const struct sock *sk) extern int dccp_feat_finalise_settings(struct dccp_sock *dp); extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); +extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, + struct sk_buff *skb); extern void dccp_feat_list_purge(struct list_head *fn_list); extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 44b10afd3fb6..da686464462d 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -300,6 +300,20 @@ cloning_failed: return -ENOMEM; } +/** + * dccp_feat_valid_nn_length - Enforce length constraints on NN options + * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, + * incoming options are accepted as long as their values are valid. + */ +static u8 dccp_feat_valid_nn_length(u8 feat_num) +{ + if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ + return 2; + if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ + return 6; + return 0; +} + static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) { switch (feat_num) { @@ -341,6 +355,57 @@ static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) return 1; } +/** + * dccp_feat_insert_opts - Generate FN options from current list state + * @skb: next sk_buff to be sent to the peer + * @dp: for client during handshake and general negotiation + * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) + */ +int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, + struct sk_buff *skb) +{ + struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; + struct dccp_feat_entry *pos, *next; + u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; + bool rpt; + + /* put entries into @skb in the order they appear in the list */ + list_for_each_entry_safe_reverse(pos, next, fn, node) { + opt = dccp_feat_genopt(pos); + type = dccp_feat_type(pos->feat_num); + rpt = false; + + if (pos->empty_confirm) { + len = 0; + ptr = NULL; + } else { + if (type == FEAT_SP) { + len = pos->val.sp.len; + ptr = pos->val.sp.vec; + rpt = pos->needs_confirm; + } else if (type == FEAT_NN) { + len = dccp_feat_valid_nn_length(pos->feat_num); + ptr = nn_in_nbo; + dccp_encode_value_var(pos->val.nn, ptr, len); + } else { + DCCP_BUG("unknown feature %u", pos->feat_num); + return -1; + } + } + + if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) + return -1; + if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) + return -1; + /* + * Enter CHANGING after transmitting the Change option (6.6.2). + */ + if (pos->state == FEAT_INITIALISING) + pos->state = FEAT_CHANGING; + } + return 0; +} + /** * __feat_register_nn - Register new NN value on socket * @fn: feature-negotiation list to register with From f8a644c07e6f38b2c3cbaf99990e867d670d207b Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 027/105] dccp: Integrate feature-negotiation insertion code The patch implements insertion of feature negotiation at the server (listening and request socket) and the client (connecting socket). In dccp_insert_options(), several statements have been grouped together now to achieve (I hope) better efficiency by reducing the number of tests each packet has to go through: - Ack Vectors are sent if the packet is neither a Data or a Request packet; - a previous issue is corrected - feature negotiation options are allowed on DataAck packets (5.8). Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/options.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/net/dccp/options.c b/net/dccp/options.c index bfa1cb8f3ef1..0e277114cb23 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -554,11 +554,25 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) dccp_insert_option_ndp(sk, skb)) return -1; - if (!dccp_packet_without_ack(skb)) { - if (dmsk->dccpms_send_ack_vector && - dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && - dccp_insert_option_ackvec(sk, skb)) + if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { + + /* Feature Negotiation */ + if (dccp_feat_insert_opts(dp, NULL, skb)) return -1; + + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { + /* + * Obtain RTT sample from Request/Response exchange. + * This is currently used in CCID 3 initialisation. + */ + if (dccp_insert_option_timestamp(sk, skb)) + return -1; + + } else if (dmsk->dccpms_send_ack_vector && + dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && + dccp_insert_option_ackvec(sk, skb)) { + return -1; + } } if (dp->dccps_hc_rx_insert_options) { @@ -567,14 +581,6 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) dp->dccps_hc_rx_insert_options = 0; } - /* - * Obtain RTT sample from Request/Response exchange. - * This is currently used in CCID 3 initialisation. - */ - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST && - dccp_insert_option_timestamp(sk, skb)) - return -1; - if (dp->dccps_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(dp, NULL, skb)) return -1; @@ -587,6 +593,9 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) { DCCP_SKB_CB(skb)->dccpd_opt_len = 0; + if (dccp_feat_insert_opts(NULL, dreq, skb)) + return -1; + if (dreq->dreq_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(NULL, dreq, skb)) return -1; From c664d4f4e2963ee355b1b0e77461eb844d1b288d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 028/105] dccp: Preference list reconciliation This provides two functions to * reconcile preference lists (with appropriate return codes) and * reorder the preference list if successful reconciliation changed the preferred value. The patch also removes the old code for processing SP/NN Change options, since new code to process these is mostly there already; related references have been commented out. The code for processing Change options follows in the next patch. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/feat.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index da686464462d..d53077bd40bf 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -718,6 +718,76 @@ static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) return 0; } +/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ +static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) +{ + u8 c, s; + + for (s = 0; s < slen; s++) + for (c = 0; c < clen; c++) + if (servlist[s] == clilist[c]) + return servlist[s]; + return -1; +} + +/** + * dccp_feat_prefer - Move preferred entry to the start of array + * Reorder the @array_len elements in @array so that @preferred_value comes + * first. Returns >0 to indicate that @preferred_value does occur in @array. + */ +static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) +{ + u8 i, does_occur = 0; + + if (array != NULL) { + for (i = 0; i < array_len; i++) + if (array[i] == preferred_value) { + array[i] = array[0]; + does_occur++; + } + if (does_occur) + array[0] = preferred_value; + } + return does_occur; +} + +/** + * dccp_feat_reconcile - Reconcile SP preference lists + * @fval: SP list to reconcile into + * @arr: received SP preference list + * @len: length of @arr in bytes + * @is_server: whether this side is the server (and @fv is the server's list) + * @reorder: whether to reorder the list in @fv after reconciling with @arr + * When successful, > 0 is returned and the reconciled list is in @fval. + * A value of 0 means that negotiation failed (no shared entry). + */ +static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, + bool is_server, bool reorder) +{ + int rc; + + if (!fv->sp.vec || !arr) { + DCCP_CRIT("NULL feature value or array"); + return 0; + } + + if (is_server) + rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); + else + rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); + + if (!reorder) + return rc; + if (rc < 0) + return 0; + + /* + * Reorder list: used for activating features and in dccp_insert_fn_opt. + */ + return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); +} + +#ifdef __this_is_the_old_framework_and_will_be_removed_later_in_a_subsequent_patch static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, u8 *rpref, u8 rlen) { @@ -913,6 +983,7 @@ static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) return 0; } +#endif /* (later) */ static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, u8 type, u8 feature) @@ -988,12 +1059,14 @@ int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) switch (feature) { /* deal with SP features */ case DCCPF_CCID: - rc = dccp_feat_sp(sk, type, feature, val, len); + /* XXX Obsoleted by next patch + rc = dccp_feat_sp(sk, type, feature, val, len); */ break; /* deal with NN features */ case DCCPF_ACK_RATIO: - rc = dccp_feat_nn(sk, type, feature, val, len); + /* XXX Obsoleted by next patch + rc = dccp_feat_nn(sk, type, feature, val, len); */ break; /* XXX implement other features */ From 5a146b97d5e93db2df075c0d820f492bb996d0e3 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 029/105] dccp: Process incoming Change feature-negotiation options This adds/replaces code for processing incoming ChangeL/R options. The main difference is that: * mandatory FN options are now interpreted inside the function (there are too many individual cases to do this externally); * the function returns an appropriate Reset code or 0, which is then used to fill in the data for the Reset packet. Old code, which is no longer used or referenced, has been removed. Signed-off-by: Gerrit Renker --- net/dccp/feat.c | 146 ++++++++++++++++++++++++++++++++++++++++++++- net/dccp/feat.h | 4 +- net/dccp/options.c | 23 +++---- 3 files changed, 155 insertions(+), 18 deletions(-) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index d53077bd40bf..01b4da7007b2 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -983,7 +983,6 @@ static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) return 0; } -#endif /* (later) */ static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, u8 type, u8 feature) @@ -1094,6 +1093,7 @@ int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) } EXPORT_SYMBOL_GPL(dccp_feat_change_recv); +#endif /* (later) */ int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) @@ -1234,6 +1234,150 @@ out_clean: EXPORT_SYMBOL_GPL(dccp_feat_clone); +/** + * dccp_feat_change_recv - Process incoming ChangeL/R options + * @fn: feature-negotiation list to update + * @is_mandatory: whether the Change was preceded by a Mandatory option + * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R + * @feat: one of %dccp_feature_numbers + * @val: NN value or SP value/preference list + * @len: length of @val in bytes + * @server: whether this node is the server (1) or the client (0) + */ +static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, + u8 feat, u8 *val, u8 len, const bool server) +{ + u8 defval, type = dccp_feat_type(feat); + const bool local = (opt == DCCPO_CHANGE_R); + struct dccp_feat_entry *entry; + dccp_feat_val fval; + + if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ + goto unknown_feature_or_value; + + /* + * Negotiation of NN features: Change R is invalid, so there is no + * simultaneous negotiation; hence we do not look up in the list. + */ + if (type == FEAT_NN) { + if (local || len > sizeof(fval.nn)) + goto unknown_feature_or_value; + + /* 6.3.2: "The feature remote MUST accept any valid value..." */ + fval.nn = dccp_decode_value_var(val, len); + if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) + goto unknown_feature_or_value; + + return dccp_feat_push_confirm(fn, feat, local, &fval); + } + + /* + * Unidirectional/simultaneous negotiation of SP features (6.3.1) + */ + entry = dccp_feat_list_lookup(fn, feat, local); + if (entry == NULL) { + /* + * No particular preferences have been registered. We deal with + * this situation by assuming that all valid values are equally + * acceptable, and apply the following checks: + * - if the peer's list is a singleton, we accept a valid value; + * - if we are the server, we first try to see if the peer (the + * client) advertises the default value. If yes, we use it, + * otherwise we accept the preferred value; + * - else if we are the client, we use the first list element. + */ + if (dccp_feat_clone_sp_val(&fval, val, 1)) + return DCCP_RESET_CODE_TOO_BUSY; + + if (len > 1 && server) { + defval = dccp_feat_default_value(feat); + if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) + fval.sp.vec[0] = defval; + } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { + kfree(fval.sp.vec); + goto unknown_feature_or_value; + } + + /* Treat unsupported CCIDs like invalid values */ + if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { + kfree(fval.sp.vec); + goto not_valid_or_not_known; + } + + return dccp_feat_push_confirm(fn, feat, local, &fval); + + } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ + return 0; + } + + if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { + entry->empty_confirm = 0; + } else if (is_mandatory) { + return DCCP_RESET_CODE_MANDATORY_ERROR; + } else if (entry->state == FEAT_INITIALISING) { + /* + * Failed simultaneous negotiation (server only): try to `save' + * the connection by checking whether entry contains the default + * value for @feat. If yes, send an empty Confirm to signal that + * the received Change was not understood - which implies using + * the default value. + * If this also fails, we use Reset as the last resort. + */ + WARN_ON(!server); + defval = dccp_feat_default_value(feat); + if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) + return DCCP_RESET_CODE_OPTION_ERROR; + entry->empty_confirm = 1; + } + entry->needs_confirm = 1; + entry->needs_mandatory = 0; + entry->state = FEAT_STABLE; + return 0; + +unknown_feature_or_value: + if (!is_mandatory) + return dccp_push_empty_confirm(fn, feat, local); + +not_valid_or_not_known: + return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR + : DCCP_RESET_CODE_OPTION_ERROR; +} + +/** + * dccp_feat_parse_options - Process Feature-Negotiation Options + * @sk: for general use and used by the client during connection setup + * @dreq: used by the server during connection setup + * @mandatory: whether @opt was preceded by a Mandatory option + * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R + * @feat: one of %dccp_feature_numbers + * @val: value contents of @opt + * @len: length of @val in bytes + * Returns 0 on success, a Reset code for ending the connection otherwise. + */ +int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, + u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; + bool server = false; + + switch (sk->sk_state) { + /* + * Negotiation during connection setup + */ + case DCCP_LISTEN: + server = true; /* fall through */ + case DCCP_REQUESTING: + switch (opt) { + case DCCPO_CHANGE_L: + case DCCPO_CHANGE_R: + return dccp_feat_change_recv(fn, mandatory, opt, feat, + val, len, server); + } + } + return 0; /* ignore FN options in all other states */ +} + int dccp_feat_init(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 8e863839ed91..ce97f3fe7687 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -115,8 +115,8 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); -extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, - u8 *val, u8 len); +extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, + u8 mand, u8 opt, u8 feat, u8 *val, u8 len); extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len); extern void dccp_feat_clean(struct dccp_minisock *dmsk); diff --git a/net/dccp/options.c b/net/dccp/options.c index 0e277114cb23..fb8466ea467a 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -135,22 +135,13 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, (unsigned long long)opt_recv->dccpor_ndp); break; case DCCPO_CHANGE_L: - /* fall through */ case DCCPO_CHANGE_R: if (pkt_type == DCCP_PKT_DATA) break; - if (len < 2) - goto out_invalid_option; - rc = dccp_feat_change_recv(sk, opt, *value, value + 1, - len - 1); - /* - * When there is a change error, change_recv is - * responsible for dealing with it. i.e. reply with an - * empty confirm. - * If the change was mandatory, then we need to die. - */ - if (rc && mandatory) - goto out_invalid_option; + rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, + *value, value + 1, len - 1); + if (rc) + goto out_featneg_failed; break; case DCCPO_CONFIRM_L: /* fall through */ @@ -292,8 +283,10 @@ out_nonsensical_length: out_invalid_option: DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; - DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len); + rc = DCCP_RESET_CODE_OPTION_ERROR; +out_featneg_failed: + DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); + DCCP_SKB_CB(skb)->dccpd_reset_code = rc; DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; From d2150b7bff3d397692cf0dc890f198d23564de5f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 030/105] dccp: Processing Confirm options Analogous to the previous patch, this adds code to interpret incoming Confirm feature-negotiation options. Both functions operate on the feature-negotiation list of either the request_sock (server) or the dccp_sock (client). Thanks to Wei Yongjun for pointing out that it is overly restrictive to check the entire list of confirmed SP values. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/feat.c | 100 ++++++++++++++++++++++++++++++++++++++++++++- net/dccp/feat.h | 2 - net/dccp/options.c | 16 +------- 3 files changed, 101 insertions(+), 17 deletions(-) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 01b4da7007b2..da3bbad9d50d 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -99,6 +99,13 @@ static int dccp_feat_default_value(u8 feat_num) return idx < 0 ? : dccp_feat_table[idx].default_value; } +/* Test for "Req'd" feature (RFC 4340, 6.4) */ +static inline int dccp_feat_must_be_understood(u8 feat_num) +{ + return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || + feat_num == DCCPF_SEQUENCE_WINDOW; +} + /* copy constructor, fval must not already contain allocated memory */ static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) { @@ -1093,7 +1100,6 @@ int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) } EXPORT_SYMBOL_GPL(dccp_feat_change_recv); -#endif /* (later) */ int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) @@ -1148,6 +1154,7 @@ int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, } EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv); +#endif /* (later) */ void dccp_feat_clean(struct dccp_minisock *dmsk) { @@ -1343,6 +1350,93 @@ not_valid_or_not_known: : DCCP_RESET_CODE_OPTION_ERROR; } +/** + * dccp_feat_confirm_recv - Process received Confirm options + * @fn: feature-negotiation list to update + * @is_mandatory: whether @opt was preceded by a Mandatory option + * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R + * @feat: one of %dccp_feature_numbers + * @val: NN value or SP value/preference list + * @len: length of @val in bytes + * @server: whether this node is server (1) or client (0) + */ +static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, + u8 feat, u8 *val, u8 len, const bool server) +{ + u8 *plist, plen, type = dccp_feat_type(feat); + const bool local = (opt == DCCPO_CONFIRM_R); + struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); + + if (entry == NULL) { /* nothing queued: ignore or handle error */ + if (is_mandatory && type == FEAT_UNKNOWN) + return DCCP_RESET_CODE_MANDATORY_ERROR; + + if (!local && type == FEAT_NN) /* 6.3.2 */ + goto confirmation_failed; + return 0; + } + + if (entry->state != FEAT_CHANGING) /* 6.6.2 */ + return 0; + + if (len == 0) { + if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ + goto confirmation_failed; + /* + * Empty Confirm during connection setup: this means reverting + * to the `old' value, which in this case is the default. Since + * we handle default values automatically when no other values + * have been set, we revert to the old value by removing this + * entry from the list. + */ + dccp_feat_list_pop(entry); + return 0; + } + + if (type == FEAT_NN) { + if (len > sizeof(entry->val.nn)) + goto confirmation_failed; + + if (entry->val.nn == dccp_decode_value_var(val, len)) + goto confirmation_succeeded; + + DCCP_WARN("Bogus Confirm for non-existing value\n"); + goto confirmation_failed; + } + + /* + * Parsing SP Confirms: the first element of @val is the preferred + * SP value which the peer confirms, the remainder depends on @len. + * Note that only the confirmed value need to be a valid SP value. + */ + if (!dccp_feat_is_valid_sp_val(feat, *val)) + goto confirmation_failed; + + if (len == 1) { /* peer didn't supply a preference list */ + plist = val; + plen = len; + } else { /* preferred value + preference list */ + plist = val + 1; + plen = len - 1; + } + + /* Check whether the peer got the reconciliation right (6.6.8) */ + if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { + DCCP_WARN("Confirm selected the wrong value %u\n", *val); + return DCCP_RESET_CODE_OPTION_ERROR; + } + entry->val.sp.vec[0] = *val; + +confirmation_succeeded: + entry->state = FEAT_STABLE; + return 0; + +confirmation_failed: + DCCP_WARN("Confirmation failed\n"); + return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR + : DCCP_RESET_CODE_OPTION_ERROR; +} + /** * dccp_feat_parse_options - Process Feature-Negotiation Options * @sk: for general use and used by the client during connection setup @@ -1373,6 +1467,10 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_CHANGE_R: return dccp_feat_change_recv(fn, mandatory, opt, feat, val, len, server); + case DCCPO_CONFIRM_R: + case DCCPO_CONFIRM_L: + return dccp_feat_confirm_recv(fn, mandatory, opt, feat, + val, len, server); } } return 0; /* ignore FN options in all other states */ diff --git a/net/dccp/feat.h b/net/dccp/feat.h index ce97f3fe7687..618bed935b33 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -117,8 +117,6 @@ extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, u8 mand, u8 opt, u8 feat, u8 *val, u8 len); -extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, - u8 *val, u8 len); extern void dccp_feat_clean(struct dccp_minisock *dmsk); extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); diff --git a/net/dccp/options.c b/net/dccp/options.c index fb8466ea467a..3a9a22f0ac1a 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -134,26 +134,14 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), (unsigned long long)opt_recv->dccpor_ndp); break; - case DCCPO_CHANGE_L: - case DCCPO_CHANGE_R: - if (pkt_type == DCCP_PKT_DATA) + case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: + if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ break; rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, *value, value + 1, len - 1); if (rc) goto out_featneg_failed; break; - case DCCPO_CONFIRM_L: - /* fall through */ - case DCCPO_CONFIRM_R: - if (pkt_type == DCCP_PKT_DATA) - break; - if (len < 2) /* FIXME this disallows empty confirm */ - goto out_invalid_option; - if (dccp_feat_confirm_recv(sk, opt, *value, - value + 1, len - 1)) - goto out_invalid_option; - break; case DCCPO_ACK_VECTOR_0: case DCCPO_ACK_VECTOR_1: if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ From c926c6aed3e444e8c88a768f063b2de8fd6ae760 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 031/105] dccp: Feature activation handlers This patch provides the post-processing of feature negotiation state, after the negotiation has completed. To this purpose, handlers are used and added to the dccp_feat_table. Each handler is passed a boolean flag whether the RX or TX side of the feature is meant. Several handlers are provided already, new handlers can easily be added. The initialisation is now fully dynamic, i.e. CCIDs are activated only after the feature negotiation. The integration of this dynamic activation is done in the subsequent patches. Thanks to Wei Yongjun for pointing out the necessity of skipping over empty Confirm options while copying the negotiated feature values. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/dccp.h | 1 + net/dccp/feat.c | 213 +++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 204 insertions(+), 10 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 2e2a6f229eaf..1baed789bcc2 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -445,6 +445,7 @@ extern int dccp_feat_finalise_settings(struct dccp_sock *dp); extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, struct sk_buff *skb); +extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); extern void dccp_feat_list_purge(struct list_head *fn_list); extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index da3bbad9d50d..f78bd357b5b1 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -25,11 +25,101 @@ #define DCCP_FEAT_SP_NOAGREE (-123) +/* + * Feature activation handlers. + * + * These all use an u64 argument, to provide enough room for NN/SP features. At + * this stage the negotiated values have been checked to be within their range. + */ +static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct ccid *new_ccid = ccid_new(ccid, sk, rx, gfp_any()); + + if (new_ccid == NULL) + return -ENOMEM; + + if (rx) { + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + dp->dccps_hc_rx_ccid = new_ccid; + } else { + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_tx_ccid = new_ccid; + } + return 0; +} + +static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) +{ + if (!rx) + dccp_msk(sk)->dccpms_sequence_window = seq_win; + return 0; +} + +static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) +{ + if (rx) + dccp_sk(sk)->dccps_r_ack_ratio = ratio; + else + dccp_sk(sk)->dccps_l_ack_ratio = ratio; + return 0; +} + +static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (rx) { + if (enable && dp->dccps_hc_rx_ackvec == NULL) { + dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); + if (dp->dccps_hc_rx_ackvec == NULL) + return -ENOMEM; + } else if (!enable) { + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); + dp->dccps_hc_rx_ackvec = NULL; + } + } + return 0; +} + +static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) +{ + if (!rx) + dccp_msk(sk)->dccpms_send_ndp_count = (enable > 0); + return 0; +} + +/* + * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that + * `rx' holds when the sending peer informs about his partial coverage via a + * ChangeR() option. In the other case, we are the sender and the receiver + * announces its coverage via ChangeL() options. The policy here is to honour + * such communication by enabling the corresponding partial coverage - but only + * if it has not been set manually before; the warning here means that all + * packets will be dropped. + */ +static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (rx) + dp->dccps_pcrlen = cscov; + else { + if (dp->dccps_pcslen == 0) + dp->dccps_pcslen = cscov; + else if (cscov > dp->dccps_pcslen) + DCCP_WARN("CsCov %u too small, peer requires >= %u\n", + dp->dccps_pcslen, (u8)cscov); + } + return 0; +} + static const struct { u8 feat_num; /* DCCPF_xxx */ enum dccp_feat_type rxtx; /* RX or TX */ enum dccp_feat_type reconciliation; /* SP or NN */ u8 default_value; /* as in 6.4 */ + int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); /* * Lookup table for location and type of features (from RFC 4340/4342) * +--------------------------+----+-----+----+----+---------+-----------+ @@ -49,16 +139,16 @@ static const struct { * +--------------------------+----+-----+----+----+---------+-----------+ */ } dccp_feat_table[] = { - { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2 }, - { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0 }, - { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100 }, - { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0 }, - { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2 }, - { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0 }, - { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0 }, - { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0 }, - { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0 }, - { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0 }, + { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, + { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, + { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, + { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, + { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, + { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, + { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, + { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, + { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, + { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, }; #define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) @@ -99,6 +189,41 @@ static int dccp_feat_default_value(u8 feat_num) return idx < 0 ? : dccp_feat_table[idx].default_value; } +static int __dccp_feat_activate(struct sock *sk, const int idx, + const bool is_local, dccp_feat_val const *fval) +{ + bool rx; + u64 val; + + if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) + return -1; + if (dccp_feat_table[idx].activation_hdlr == NULL) + return 0; + + if (fval == NULL) { + val = dccp_feat_table[idx].default_value; + } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { + if (fval->sp.vec == NULL) { + /* + * This can happen when an empty Confirm is sent + * for an SP (i.e. known) feature. In this case + * we would be using the default anyway. + */ + DCCP_CRIT("Feature #%d undefined: using default", idx); + val = dccp_feat_table[idx].default_value; + } else { + val = fval->sp.vec[0]; + } + } else { + val = fval->nn; + } + + /* Location is RX if this is a local-RX or remote-TX feature */ + rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); + + return dccp_feat_table[idx].activation_hdlr(sk, val, rx); +} + /* Test for "Req'd" feature (RFC 4340, 6.4) */ static inline int dccp_feat_must_be_understood(u8 feat_num) { @@ -1506,6 +1631,74 @@ out: EXPORT_SYMBOL_GPL(dccp_feat_init); +int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_feat_entry *cur, *next; + int idx; + dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { + [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } + }; + + list_for_each_entry(cur, fn_list, node) { + /* + * An empty Confirm means that either an unknown feature type + * or an invalid value was present. In the first case there is + * nothing to activate, in the other the default value is used. + */ + if (cur->empty_confirm) + continue; + + idx = dccp_feat_index(cur->feat_num); + if (idx < 0) { + DCCP_BUG("Unknown feature %u", cur->feat_num); + goto activation_failed; + } + if (cur->state != FEAT_STABLE) { + DCCP_CRIT("Negotiation of %s %u failed in state %u", + cur->is_local ? "local" : "remote", + cur->feat_num, cur->state); + goto activation_failed; + } + fvals[idx][cur->is_local] = &cur->val; + } + + /* + * Activate in decreasing order of index, so that the CCIDs are always + * activated as the last feature. This avoids the case where a CCID + * relies on the initialisation of one or more features that it depends + * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). + */ + for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) + if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || + __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { + DCCP_CRIT("Could not activate %d", idx); + goto activation_failed; + } + + /* Clean up Change options which have been confirmed already */ + list_for_each_entry_safe(cur, next, fn_list, node) + if (!cur->needs_confirm) + dccp_feat_list_pop(cur); + + dccp_pr_debug("Activation OK\n"); + return 0; + +activation_failed: + /* + * We clean up everything that may have been allocated, since + * it is difficult to track at which stage negotiation failed. + * This is ok, since all allocation functions below are robust + * against NULL arguments. + */ + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); + dp->dccps_hc_rx_ackvec = NULL; + return -1; +} + #ifdef CONFIG_IP_DCCP_DEBUG const char *dccp_feat_typename(const u8 type) { From 3a53a9adfa269da7fa40fc476f09e46155c0143d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 032/105] dccp: Integration of dynamic feature activation - part 1 (socket setup) This first patch out of three replaces the hardcoded default settings with initialisation code for the dynamic feature negotiation. Note on retransmitting Confirm options: --------------------------------------- This patch also defers flushing the client feature-negotiation queue, due to the following considerations. As long as the client is in PARTOPEN, it needs to retransmit the Confirm options for the Change options received on the DCCP-Response from the server. Otherwise, if the packet containing the Confirm options gets dropped in the network, the connection aborts due to undefined feature negotiation state. Thanks to Leandro Melo de Sales who reported a bug in an earlier revision of the patch set, resulting from not retransmitting the Confirm options. The patch now ensures that the client feature-negotiation queue is flushed only when entering the OPEN state. Since confirmed Change options are removed as soon as they are confirmed (in the DCCP-Response), this ensures that Confirm options are retransmitted. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/proto.c | 46 ++++++---------------------------------------- 1 file changed, 6 insertions(+), 40 deletions(-) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 6550452d59e2..0d420790b795 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -67,6 +67,9 @@ void dccp_set_state(struct sock *sk, const int state) case DCCP_OPEN: if (oldstate != DCCP_OPEN) DCCP_INC_STATS(DCCP_MIB_CURRESTAB); + /* Client retransmits all Confirm options until entering OPEN */ + if (oldstate == DCCP_PARTOPEN) + dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); break; case DCCP_CLOSED: @@ -175,7 +178,6 @@ EXPORT_SYMBOL_GPL(dccp_state_name); int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); struct inet_connection_sock *icsk = inet_csk(sk); dccp_minisock_init(&dp->dccps_minisock); @@ -194,45 +196,9 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dccp_init_xmit_timers(sk); INIT_LIST_HEAD(&dp->dccps_featneg); - /* - * FIXME: We're hardcoding the CCID, and doing this at this point makes - * the listening (master) sock get CCID control blocks, which is not - * necessary, but for now, to not mess with the test userspace apps, - * lets leave it here, later the real solution is to do this in a - * setsockopt(CCIDs-I-want/accept). -acme - */ - if (likely(ctl_sock_initialized)) { - int rc = dccp_feat_init(sk); - - if (rc) - return rc; - - if (dmsk->dccpms_send_ack_vector) { - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL); - if (dp->dccps_hc_rx_ackvec == NULL) - return -ENOMEM; - } - dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid, - sk, GFP_KERNEL); - dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid, - sk, GFP_KERNEL); - if (unlikely(dp->dccps_hc_rx_ccid == NULL || - dp->dccps_hc_tx_ccid == NULL)) { - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - if (dmsk->dccpms_send_ack_vector) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; - return -ENOMEM; - } - } else { - /* control socket doesn't need feat nego */ - INIT_LIST_HEAD(&dmsk->dccpms_pending); - INIT_LIST_HEAD(&dmsk->dccpms_conf); - } - + /* control socket doesn't need feat nego */ + if (likely(ctl_sock_initialized)) + return dccp_feat_init(sk); return 0; } From e70cacb90d76f0632f7bba69c87a62e709e84619 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 033/105] dccp: Integration of dynamic feature activation - part 2 (server side) This patch integrates the activation of features at the end of negotiation into the server-side code. Note: In dccp_create_openreq_child the request_sock argument is no longer constant, since dccp_activate_values() uses the feature-negotiation list on dreq to sort out the initialisation values for the different features of the child socket; and purges this queue after use (but the `req' argument to openreq_child can and does still remain constant). Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/minisocks.c | 42 ++++++++++++------------------------------ 1 file changed, 12 insertions(+), 30 deletions(-) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index ee7f40f8ddfa..258195972bce 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -111,7 +111,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); if (newsk != NULL) { - const struct dccp_request_sock *dreq = dccp_rsk(req); + struct dccp_request_sock *dreq = dccp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct dccp_sock *newdp = dccp_sk(newsk); struct dccp_minisock *newdmsk = dccp_msk(newsk); @@ -125,35 +125,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk, newicsk->icsk_rto = DCCP_TIMEOUT_INIT; INIT_LIST_HEAD(&newdp->dccps_featneg); - if (dccp_feat_clone(sk, newsk)) - goto out_free; - - if (newdmsk->dccpms_send_ack_vector) { - newdp->dccps_hc_rx_ackvec = - dccp_ackvec_alloc(GFP_ATOMIC); - if (unlikely(newdp->dccps_hc_rx_ackvec == NULL)) - goto out_free; - } - - newdp->dccps_hc_rx_ccid = - ccid_hc_rx_new(newdmsk->dccpms_rx_ccid, - newsk, GFP_ATOMIC); - newdp->dccps_hc_tx_ccid = - ccid_hc_tx_new(newdmsk->dccpms_tx_ccid, - newsk, GFP_ATOMIC); - if (unlikely(newdp->dccps_hc_rx_ccid == NULL || - newdp->dccps_hc_tx_ccid == NULL)) { - dccp_ackvec_free(newdp->dccps_hc_rx_ackvec); - ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk); - ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk); -out_free: - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - return NULL; - } - /* * Step 3: Process LISTEN state * @@ -184,6 +155,17 @@ out_free: dccp_set_seqno(&newdp->dccps_awl, max48(newdp->dccps_awl, newdp->dccps_iss)); + /* + * Activate features after initialising the sequence numbers, + * since CCID initialisation may depend on GSS, ISR, ISS etc. + */ + if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; + } dccp_init_xmit_timers(newsk); DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); From c49b22729f3da7479c4e6c572d53fdd40201d0bd Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 034/105] dccp: Integration of dynamic feature activation - part 3 (client side) This integrates feature-activation in the client, with these details: 1. When dccp_parse_options() fails, the reset code is already set, request_sent _state_process() currently overrides this with `Packet Error', which is not intended - so changed to use the reset code set in dccp_parse_options(); 2. There was a FIXME to change the error code when dccp_ackvec_add() fails. I have looked this up and found that: * the check whether ackno < ISN is already made earlier, * this Response is likely the 1st packet with an Ackno that the client gets, * so when dccp_ackvec_add() fails, the reason is likely not a packet error. 3. When feature negotiation fails, the socket should be marked as not usable, so that the application is notified that an error occurs. This is achieved by a new label, which uses an error code of `Aborted' and which sets the socket state to CLOSED, as well as sk_err. 4. Avoids parsing the Ack twice in Respond state by not doing option processing again in dccp_rcv_respond_partopen_state_process (as option processing has already been done on the request_sock in dccp_check_req). Since this addresses congestion-control initialisation, a corresponding FIXME has been removed. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/input.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/net/dccp/input.c b/net/dccp/input.c index 3070015edc75..0672b7e1763e 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -421,8 +421,13 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } + /* + * If option processing (Step 8) failed, return 1 here so that + * dccp_v4_do_rcv() sends a Reset. The Reset code depends on + * the option type and is set in dccp_parse_options(). + */ if (dccp_parse_options(sk, NULL, skb)) - goto out_invalid_packet; + return 1; /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) @@ -475,6 +480,15 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, */ dccp_set_state(sk, DCCP_PARTOPEN); + /* + * If feature negotiation was successful, activate features now; + * an activation failure means that this host could not activate + * one ore more features (e.g. insufficient memory), which would + * leave at least one feature in an undefined state. + */ + if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) + goto unable_to_proceed; + /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); @@ -509,6 +523,16 @@ out_invalid_packet: /* dccp_v4_do_rcv will send a reset */ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; return 1; + +unable_to_proceed: + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; + /* + * We mark this socket as no longer usable, so that the loop in + * dccp_sendmsg() terminates and the application gets notified. + */ + dccp_set_state(sk, DCCP_CLOSED); + sk->sk_err = ECOMM; + return 1; } static int dccp_rcv_respond_partopen_state_process(struct sock *sk, @@ -600,7 +624,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 1; } - if (sk->sk_state != DCCP_REQUESTING) { + if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) { if (dccp_check_seqno(sk, skb)) goto discard; @@ -665,8 +689,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 1; case DCCP_REQUESTING: - /* FIXME: do congestion control initialization */ - queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); if (queued >= 0) return queued; From 23479cbfd30402c7d9fa413cc467983061073557 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 035/105] dccp: Clean up old feature-negotiation infrastructure The code removed by this patch is no longer referenced or used, the added lines update documentation and copyrights. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/feat.c | 505 +----------------------------------------------- net/dccp/feat.h | 12 +- 2 files changed, 12 insertions(+), 505 deletions(-) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index f78bd357b5b1..6c82dea409d9 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1,8 +1,13 @@ /* * net/dccp/feat.c * - * An implementation of the DCCP protocol - * Andrea Bittau + * Feature negotiation for the DCCP protocol (RFC 4340, section 6) + * + * Copyright (c) 2008 The University of Aberdeen, Scotland, UK + * Copyright (c) 2008 Gerrit Renker + * Rewrote from scratch, some bits from earlier code by + * Copyright (c) 2005 Andrea Bittau + * * * ASSUMPTIONS * ----------- @@ -17,14 +22,10 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ - #include - #include "ccid.h" #include "feat.h" -#define DCCP_FEAT_SP_NOAGREE (-123) - /* * Feature activation handlers. * @@ -805,51 +806,6 @@ int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) return 0; } -static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); - /* figure out if we are changing our CCID or the peer's */ - const int rx = type == DCCPO_CHANGE_R; - const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid; - struct ccid *new_ccid; - - /* Check if nothing is being changed. */ - if (ccid_nr == new_ccid_nr) - return 0; - - new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC); - if (new_ccid == NULL) - return -ENOMEM; - - if (rx) { - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - dp->dccps_hc_rx_ccid = new_ccid; - dmsk->dccpms_rx_ccid = new_ccid_nr; - } else { - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_tx_ccid = new_ccid; - dmsk->dccpms_tx_ccid = new_ccid_nr; - } - - return 0; -} - -static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) -{ - dccp_feat_debug(type, feat, val); - - switch (feat) { - case DCCPF_CCID: - return dccp_feat_update_ccid(sk, type, val); - default: - dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n", - dccp_feat_typename(type), feat); - break; - } - return 0; -} - /* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) { @@ -919,453 +875,6 @@ static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); } -#ifdef __this_is_the_old_framework_and_will_be_removed_later_in_a_subsequent_patch -static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, - u8 *rpref, u8 rlen) -{ - struct dccp_sock *dp = dccp_sk(sk); - u8 *spref, slen, *res = NULL; - int i, j, rc, agree = 1; - - BUG_ON(rpref == NULL); - - /* check if we are the black sheep */ - if (dp->dccps_role == DCCP_ROLE_CLIENT) { - spref = rpref; - slen = rlen; - rpref = opt->dccpop_val; - rlen = opt->dccpop_len; - } else { - spref = opt->dccpop_val; - slen = opt->dccpop_len; - } - /* - * Now we have server preference list in spref and client preference in - * rpref - */ - BUG_ON(spref == NULL); - BUG_ON(rpref == NULL); - - /* FIXME sanity check vals */ - - /* Are values in any order? XXX Lame "algorithm" here */ - for (i = 0; i < slen; i++) { - for (j = 0; j < rlen; j++) { - if (spref[i] == rpref[j]) { - res = &spref[i]; - break; - } - } - if (res) - break; - } - - /* we didn't agree on anything */ - if (res == NULL) { - /* confirm previous value */ - switch (opt->dccpop_feat) { - case DCCPF_CCID: - /* XXX did i get this right? =P */ - if (opt->dccpop_type == DCCPO_CHANGE_L) - res = &dccp_msk(sk)->dccpms_tx_ccid; - else - res = &dccp_msk(sk)->dccpms_rx_ccid; - break; - - default: - DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat); - /* XXX implement res */ - return -EFAULT; - } - - dccp_pr_debug("Don't agree... reconfirming %d\n", *res); - agree = 0; /* this is used for mandatory options... */ - } - - /* need to put result and our preference list */ - rlen = 1 + opt->dccpop_len; - rpref = kmalloc(rlen, GFP_ATOMIC); - if (rpref == NULL) - return -ENOMEM; - - *rpref = *res; - memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len); - - /* put it in the "confirm queue" */ - if (opt->dccpop_sc == NULL) { - opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC); - if (opt->dccpop_sc == NULL) { - kfree(rpref); - return -ENOMEM; - } - } else { - /* recycle the confirm slot */ - BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); - kfree(opt->dccpop_sc->dccpoc_val); - dccp_pr_debug("recycling confirm slot\n"); - } - memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc)); - - opt->dccpop_sc->dccpoc_val = rpref; - opt->dccpop_sc->dccpoc_len = rlen; - - /* update the option on our side [we are about to send the confirm] */ - rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res); - if (rc) { - kfree(opt->dccpop_sc->dccpoc_val); - kfree(opt->dccpop_sc); - opt->dccpop_sc = NULL; - return rc; - } - - dccp_pr_debug("Will confirm %d\n", *rpref); - - /* say we want to change to X but we just got a confirm X, suppress our - * change - */ - if (!opt->dccpop_conf) { - if (*opt->dccpop_val == *res) - opt->dccpop_conf = 1; - dccp_pr_debug("won't ask for change of same feature\n"); - } - - return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */ -} - -static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) -{ - struct dccp_minisock *dmsk = dccp_msk(sk); - struct dccp_opt_pend *opt; - int rc = 1; - u8 t; - - /* - * We received a CHANGE. We gotta match it against our own preference - * list. If we got a CHANGE_R it means it's a change for us, so we need - * to compare our CHANGE_L list. - */ - if (type == DCCPO_CHANGE_L) - t = DCCPO_CHANGE_R; - else - t = DCCPO_CHANGE_L; - - /* find our preference list for this feature */ - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { - if (opt->dccpop_type != t || opt->dccpop_feat != feature) - continue; - - /* find the winner from the two preference lists */ - rc = dccp_feat_reconcile(sk, opt, val, len); - break; - } - - /* We didn't deal with the change. This can happen if we have no - * preference list for the feature. In fact, it just shouldn't - * happen---if we understand a feature, we should have a preference list - * with at least the default value. - */ - BUG_ON(rc == 1); - - return rc; -} - -static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) -{ - struct dccp_opt_pend *opt; - struct dccp_minisock *dmsk = dccp_msk(sk); - u8 *copy; - int rc; - - /* NN features must be Change L (sec. 6.3.2) */ - if (type != DCCPO_CHANGE_L) { - dccp_pr_debug("received %s for NN feature %d\n", - dccp_feat_typename(type), feature); - return -EFAULT; - } - - /* XXX sanity check opt val */ - - /* copy option so we can confirm it */ - opt = kzalloc(sizeof(*opt), GFP_ATOMIC); - if (opt == NULL) - return -ENOMEM; - - copy = kmemdup(val, len, GFP_ATOMIC); - if (copy == NULL) { - kfree(opt); - return -ENOMEM; - } - - opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */ - opt->dccpop_feat = feature; - opt->dccpop_val = copy; - opt->dccpop_len = len; - - /* change feature */ - rc = dccp_feat_update(sk, type, feature, *val); - if (rc) { - kfree(opt->dccpop_val); - kfree(opt); - return rc; - } - - dccp_feat_debug(type, feature, *copy); - - list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); - - return 0; -} - -static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, - u8 type, u8 feature) -{ - /* XXX check if other confirms for that are queued and recycle slot */ - struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC); - - if (opt == NULL) { - /* XXX what do we do? Ignoring should be fine. It's a change - * after all =P - */ - return; - } - - switch (type) { - case DCCPO_CHANGE_L: - opt->dccpop_type = DCCPO_CONFIRM_R; - break; - case DCCPO_CHANGE_R: - opt->dccpop_type = DCCPO_CONFIRM_L; - break; - default: - DCCP_WARN("invalid type %d\n", type); - kfree(opt); - return; - } - opt->dccpop_feat = feature; - opt->dccpop_val = NULL; - opt->dccpop_len = 0; - - /* change feature */ - dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature); - - list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); -} - -static void dccp_feat_flush_confirm(struct sock *sk) -{ - struct dccp_minisock *dmsk = dccp_msk(sk); - /* Check if there is anything to confirm in the first place */ - int yes = !list_empty(&dmsk->dccpms_conf); - - if (!yes) { - struct dccp_opt_pend *opt; - - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { - if (opt->dccpop_conf) { - yes = 1; - break; - } - } - } - - if (!yes) - return; - - /* OK there is something to confirm... */ - /* XXX check if packet is in flight? Send delayed ack?? */ - if (sk->sk_state == DCCP_OPEN) - dccp_send_ack(sk); -} - -int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) -{ - int rc; - - /* Ignore Change requests other than during connection setup */ - if (sk->sk_state != DCCP_LISTEN && sk->sk_state != DCCP_REQUESTING) - return 0; - dccp_feat_debug(type, feature, *val); - - /* figure out if it's SP or NN feature */ - switch (feature) { - /* deal with SP features */ - case DCCPF_CCID: - /* XXX Obsoleted by next patch - rc = dccp_feat_sp(sk, type, feature, val, len); */ - break; - - /* deal with NN features */ - case DCCPF_ACK_RATIO: - /* XXX Obsoleted by next patch - rc = dccp_feat_nn(sk, type, feature, val, len); */ - break; - - /* XXX implement other features */ - default: - dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n", - dccp_feat_typename(type), feature); - rc = -EFAULT; - break; - } - - /* check if there were problems changing features */ - if (rc) { - /* If we don't agree on SP, we sent a confirm for old value. - * However we propagate rc to caller in case option was - * mandatory - */ - if (rc != DCCP_FEAT_SP_NOAGREE) - dccp_feat_empty_confirm(dccp_msk(sk), type, feature); - } - - /* generate the confirm [if required] */ - dccp_feat_flush_confirm(sk); - - return rc; -} - -EXPORT_SYMBOL_GPL(dccp_feat_change_recv); - -int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, - u8 *val, u8 len) -{ - u8 t; - struct dccp_opt_pend *opt; - struct dccp_minisock *dmsk = dccp_msk(sk); - int found = 0; - int all_confirmed = 1; - - /* Ignore Confirm options other than during connection setup */ - if (sk->sk_state != DCCP_LISTEN && sk->sk_state != DCCP_REQUESTING) - return 0; - dccp_feat_debug(type, feature, *val); - - /* locate our change request */ - switch (type) { - case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break; - case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break; - default: DCCP_WARN("invalid type %d\n", type); - return 1; - - } - /* XXX sanity check feature value */ - - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { - if (!opt->dccpop_conf && opt->dccpop_type == t && - opt->dccpop_feat == feature) { - found = 1; - dccp_pr_debug("feature %d found\n", opt->dccpop_feat); - - /* XXX do sanity check */ - - opt->dccpop_conf = 1; - - /* We got a confirmation---change the option */ - dccp_feat_update(sk, opt->dccpop_type, - opt->dccpop_feat, *val); - - /* XXX check the return value of dccp_feat_update */ - break; - } - - if (!opt->dccpop_conf) - all_confirmed = 0; - } - - if (!found) - dccp_pr_debug("%s(%d, ...) never requested\n", - dccp_feat_typename(type), feature); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv); -#endif /* (later) */ - -void dccp_feat_clean(struct dccp_minisock *dmsk) -{ - struct dccp_opt_pend *opt, *next; - - list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending, - dccpop_node) { - BUG_ON(opt->dccpop_val == NULL); - kfree(opt->dccpop_val); - - if (opt->dccpop_sc != NULL) { - BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); - kfree(opt->dccpop_sc->dccpoc_val); - kfree(opt->dccpop_sc); - } - - kfree(opt); - } - INIT_LIST_HEAD(&dmsk->dccpms_pending); - - list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { - BUG_ON(opt == NULL); - if (opt->dccpop_val != NULL) - kfree(opt->dccpop_val); - kfree(opt); - } - INIT_LIST_HEAD(&dmsk->dccpms_conf); -} - -EXPORT_SYMBOL_GPL(dccp_feat_clean); - -/* this is to be called only when a listening sock creates its child. It is - * assumed by the function---the confirm is not duplicated, but rather it is - * "passed on". - */ -int dccp_feat_clone(struct sock *oldsk, struct sock *newsk) -{ - struct dccp_minisock *olddmsk = dccp_msk(oldsk); - struct dccp_minisock *newdmsk = dccp_msk(newsk); - struct dccp_opt_pend *opt; - int rc = 0; - - INIT_LIST_HEAD(&newdmsk->dccpms_pending); - INIT_LIST_HEAD(&newdmsk->dccpms_conf); - - list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) { - struct dccp_opt_pend *newopt; - /* copy the value of the option */ - u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC); - - if (val == NULL) - goto out_clean; - - newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC); - if (newopt == NULL) { - kfree(val); - goto out_clean; - } - - /* insert the option */ - newopt->dccpop_val = val; - list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending); - - /* XXX what happens with backlogs and multiple connections at - * once... - */ - /* the master socket no longer needs to worry about confirms */ - opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */ - - /* reset state for a new socket */ - opt->dccpop_conf = 0; - } - - /* XXX not doing anything about the conf queue */ - -out: - return rc; - -out_clean: - dccp_feat_clean(newdmsk); - rc = -ENOMEM; - goto out; -} - -EXPORT_SYMBOL_GPL(dccp_feat_clone); - /** * dccp_feat_change_recv - Process incoming ChangeL/R options * @fn: feature-negotiation list to update diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 618bed935b33..177e9c39ea9f 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -3,14 +3,14 @@ /* * net/dccp/feat.h * - * An implementation of the DCCP protocol + * Feature negotiation for the DCCP protocol (RFC 4340, section 6) + * Copyright (c) 2008 Gerrit Renker * Copyright (c) 2005 Andrea Bittau * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. */ - #include #include "dccp.h" @@ -117,8 +117,6 @@ extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, u8 mand, u8 opt, u8 feat, u8 *val, u8 len); -extern void dccp_feat_clean(struct dccp_minisock *dmsk); -extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); extern int dccp_feat_init(struct sock *sk); From 78673e24df27c76ec75565f4024d45c2c74ef148 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 036/105] dccp: Remove obsolete parts of the old CCID interface The TX/RX CCIDs of the minisock are now redundant: similar to the Ack Vector case, their value equals initially that of the sysctl, but at the end of feature negotiation may be something different. The old interface removed by this patch thus has been replaced by the newer interface to dynamically query the currently loaded CCIDs earlier in this patch set. Also removed the constructors for the TX CCID and the RX CCID, since the switch rx/non-rx is done by the handler in minisocks.c (and the handler is the only place in the code where CCIDs are loaded). Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 5 +++-- include/linux/dccp.h | 3 --- net/dccp/ccid.c | 14 -------------- net/dccp/ccid.h | 5 ----- net/dccp/feat.c | 12 ------------ net/dccp/minisocks.c | 2 -- 6 files changed, 3 insertions(+), 38 deletions(-) diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 610083ff73f6..a203d132dbef 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -140,10 +140,11 @@ send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). tx_ccid = 2 - Default CCID for the sender-receiver half-connection. + Default CCID for the sender-receiver half-connection. Depending on the + choice of CCID, the Send Ack Vector feature is enabled automatically. rx_ccid = 2 - Default CCID for the receiver-sender half-connection. + Default CCID for the receiver-sender half-connection; see tx_ccid. seq_window = 100 The initial sequence window (sec. 7.5.2). diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6a72ff52a8a4..46daea312d92 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -370,7 +370,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_ccid - Congestion Control Id (CCID) (section 10) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) * @dccpms_pending - List of features being negotiated @@ -378,8 +377,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) */ struct dccp_minisock { __u64 dccpms_sequence_window; - __u8 dccpms_rx_ccid; - __u8 dccpms_tx_ccid; __u8 dccpms_send_ack_vector; __u8 dccpms_send_ndp_count; struct list_head dccpms_pending; diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index f72ca83df552..330372a1a0b6 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -253,20 +253,6 @@ out_module_put: EXPORT_SYMBOL_GPL(ccid_new); -struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) -{ - return ccid_new(id, sk, 1, gfp); -} - -EXPORT_SYMBOL_GPL(ccid_hc_rx_new); - -struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) -{ - return ccid_new(id, sk, 0, gfp); -} - -EXPORT_SYMBOL_GPL(ccid_hc_tx_new); - static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) { struct ccid_operations *ccid_ops; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 803343aed004..18f69423a708 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -111,11 +111,6 @@ extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp); -extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, - gfp_t gfp); -extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, - gfp_t gfp); - static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) { struct ccid *ccid = dp->dccps_hc_rx_ccid; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 6c82dea409d9..cb2ddd2b894b 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1119,18 +1119,6 @@ int dccp_feat_init(struct sock *sk) INIT_LIST_HEAD(&dmsk->dccpms_pending); /* XXX no longer used */ INIT_LIST_HEAD(&dmsk->dccpms_conf); /* XXX no longer used */ - /* CCID L */ - rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 1, 0, - &dmsk->dccpms_tx_ccid, 1); - if (rc) - goto out; - - /* CCID R */ - rc = __feat_register_sp(&dp->dccps_featneg, DCCPF_CCID, 0, 0, - &dmsk->dccpms_rx_ccid, 1); - if (rc) - goto out; - /* Ack ratio */ rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, dp->dccps_l_ack_ratio); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 258195972bce..486d61df2604 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -45,8 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; - dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; - dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } From 68e074bfcef269bc61006c2740d7f89ccbbd93d7 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 037/105] dccp: Remove manual influence on NDP Count feature Updating the NDP count feature is handled automatically now: * for CCID-2 it is disabled, since the code does not use NDP counts; * for CCID-3 it is enabled, as NDP counts are used to determine loss lengths. Allowing the user to change NDP values leads to unpredictable and failing behaviour, since it is then possible to disable NDP counts even when they are needed (e.g. in CCID-3). This means that only those user settings are sensible that agree with the values for Send NDP Count implied by the choice of CCID. But those settings are already activated by the feature negotiation (CCID dependency tracking), hence this form of support is redundant. At startup the initialisation of the NDP count feature is with the default value of 0, which is done implicitly by the zeroing-out of the socket when it is allocated. If the choice of CCID or feature negotiation enables NDP count, this will then be updated via the NDP activation handler. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 4 ++-- net/dccp/dccp.h | 1 - net/dccp/feat.c | 2 +- net/dccp/minisocks.c | 1 - net/dccp/options.c | 4 +--- net/dccp/sysctl.c | 7 ------- 7 files changed, 4 insertions(+), 18 deletions(-) diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index a203d132dbef..1403745ab406 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -133,9 +133,6 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. -send_ndp = 1 - Whether or not to send NDP count options (sec. 7.7.2). - send_ackvec = 1 Whether or not to send Ack Vector options (sec. 11.5). diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 46daea312d92..60e94438eadd 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -371,14 +371,12 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) - * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; __u8 dccpms_send_ack_vector; - __u8 dccpms_send_ndp_count; struct list_head dccpms_pending; struct list_head dccpms_conf; }; @@ -490,6 +488,7 @@ struct dccp_ackvec; * @dccps_r_ack_ratio - feature-remote Ack Ratio * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) + * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) @@ -529,6 +528,7 @@ struct dccp_sock { __u16 dccps_r_ack_ratio; __u8 dccps_pcslen:4; __u8 dccps_pcrlen:4; + __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; struct dccp_minisock dccps_minisock; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 1baed789bcc2..51436c825655 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -99,7 +99,6 @@ extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; extern int sysctl_dccp_feat_send_ack_vector; -extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index cb2ddd2b894b..35a57ab3bb1e 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -86,7 +86,7 @@ static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) { if (!rx) - dccp_msk(sk)->dccpms_send_ndp_count = (enable > 0); + dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); return 0; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 486d61df2604..9e2232572662 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -46,7 +46,6 @@ void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; - dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } void dccp_time_wait(struct sock *sk, int state, int timeo) diff --git a/net/dccp/options.c b/net/dccp/options.c index 3a9a22f0ac1a..6b0704497e83 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -27,7 +27,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; -int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; u64 dccp_decode_value_var(const u8 *bf, const u8 len) { @@ -531,8 +530,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (dmsk->dccpms_send_ndp_count && - dccp_insert_option_ndp(sk, skb)) + if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) return -1; if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index f6e54f433e29..587c12f915c1 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -47,13 +47,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "send_ndp", - .data = &sysctl_dccp_feat_send_ndp_count, - .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, From b235dc4abbc1356284bd0dc730efa711f394e0e2 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 038/105] dccp ccid-2: Phase out the use of boolean Ack Vector sysctl This removes the use of the sysctl and the minisock variable for the Send Ack Vector feature, which is now handled fully dynamically via feature negotiation; i.e. when CCID2 is enabled, Ack Vectors are automatically enabled (as per RFC 4341, 4.). Using a sysctl in parallel to this implementation would open the door to crashes, since much of the code relies on tests of the boolean minisock / sysctl variable. Thus, this patch replaces all tests of type if (dccp_msk(sk)->dccpms_send_ack_vector) /* ... */ with if (dp->dccps_hc_rx_ackvec != NULL) /* ... */ The dccps_hc_rx_ackvec is allocated by the dccp_hdlr_ackvec() when feature negotiation concluded that Ack Vectors are to be used on the half-connection. Otherwise, it is NULL (due to dccp_init_sock/dccp_create_openreq_child), so that the test is a valid one. The activation handler for Ack Vectors is called as soon as the feature negotiation has concluded at the * server when the Ack marking the transition RESPOND => OPEN arrives; * client after it has sent its ACK, marking the transition REQUEST => PARTOPEN. Adding the sequence number of the Response packet to the Ack Vector has been removed, since (a) connection establishment implies that the Response has been received; (b) the CCIDs only look at packets received in the (PART)OPEN state, i.e. this entry will always be ignored; (c) it can not be used for anything useful - to detect loss for instance, only packets received after the loss can serve as pseudo-dupacks. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 3 --- include/linux/dccp.h | 3 --- net/dccp/dccp.h | 3 +-- net/dccp/diag.c | 2 +- net/dccp/input.c | 12 +++--------- net/dccp/minisocks.c | 1 - net/dccp/options.c | 7 ++----- net/dccp/proto.c | 3 +-- net/dccp/sysctl.c | 7 ------- 9 files changed, 8 insertions(+), 33 deletions(-) diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 1403745ab406..7a3bb1abb830 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -133,9 +133,6 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. -send_ackvec = 1 - Whether or not to send Ack Vector options (sec. 11.5). - tx_ccid = 2 Default CCID for the sender-receiver half-connection. Depending on the choice of CCID, the Send Ack Vector feature is enabled automatically. diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 60e94438eadd..61734e27abb7 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -360,7 +360,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) #define DCCPF_INITIAL_SEQUENCE_WINDOW 100 #define DCCPF_INITIAL_ACK_RATIO 2 #define DCCPF_INITIAL_CCID DCCPC_CCID2 -#define DCCPF_INITIAL_SEND_ACK_VECTOR 1 /* FIXME: for now we're default to 1 but it should really be 0 */ #define DCCPF_INITIAL_SEND_NDP_COUNT 1 @@ -370,13 +369,11 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) * @dccpms_pending - List of features being negotiated * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; - __u8 dccpms_send_ack_vector; struct list_head dccpms_pending; struct list_head dccpms_conf; }; diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 51436c825655..3fd16e82c003 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -98,7 +98,6 @@ extern int sysctl_dccp_retries2; extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; -extern int sysctl_dccp_feat_send_ack_vector; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; @@ -434,7 +433,7 @@ static inline int dccp_ack_pending(const struct sock *sk) const struct dccp_sock *dp = dccp_sk(sk); return dp->dccps_timestamp_echo != 0 || #ifdef CONFIG_IP_DCCP_ACKVEC - (dccp_msk(sk)->dccpms_send_ack_vector && + (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || #endif inet_csk_ack_scheduled(sk); diff --git a/net/dccp/diag.c b/net/dccp/diag.c index d8a3509b26f6..93aae7c95550 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -29,7 +29,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_backoff = icsk->icsk_backoff; info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) info->tcpi_options |= TCPI_OPT_SACK; ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); diff --git a/net/dccp/input.c b/net/dccp/input.c index 0672b7e1763e..5eb443f656c1 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -163,7 +163,7 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_ack_seq); } @@ -375,7 +375,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) @@ -434,12 +434,6 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - dp->dccps_options_received.dccpor_timestamp_echo)); - if (dccp_msk(sk)->dccpms_send_ack_vector && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) - goto out_invalid_packet; /* FIXME: change error code */ - /* Stop the REQUEST timer */ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); WARN_ON(sk->sk_send_head == NULL); @@ -637,7 +631,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 9e2232572662..0ebf8ebcf3de 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -45,7 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; - dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; } void dccp_time_wait(struct sock *sk, int state, int timeo) diff --git a/net/dccp/options.c b/net/dccp/options.c index 6b0704497e83..aca309e16632 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -26,7 +26,6 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; u64 dccp_decode_value_var(const u8 *bf, const u8 len) { @@ -145,8 +144,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_ACK_VECTOR_1: if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ break; - - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) goto out_invalid_option; break; @@ -526,7 +524,6 @@ static void dccp_insert_option_padding(struct sk_buff *skb) int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); DCCP_SKB_CB(skb)->dccpd_opt_len = 0; @@ -547,7 +544,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) if (dccp_insert_option_timestamp(sk, skb)) return -1; - } else if (dmsk->dccpms_send_ack_vector && + } else if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && dccp_insert_option_ackvec(sk, skb)) { return -1; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 0d420790b795..775eaa3d0c49 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -207,7 +207,6 @@ EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); /* * DCCP doesn't use sk_write_queue, just sk_send_head @@ -225,7 +224,7 @@ void dccp_destroy_sock(struct sock *sk) kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; - if (dmsk->dccpms_send_ack_vector) { + if (dp->dccps_hc_rx_ackvec != NULL) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 587c12f915c1..018e210875e1 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -40,13 +40,6 @@ static struct ctl_table dccp_default_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "send_ackvec", - .data = &sysctl_dccp_feat_send_ack_vector, - .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, From 5d3dac267a7fd0811ec777e76a81f97f5cdcb395 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 039/105] dccp: Initialisation framework for feature negotiation This initialises feature negotiation from two tables, which are initialised from sysctls. As a novel feature, specifics of the implementation (e.g. currently short seqnos and ECN are not supported) are advertised for robustness. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- include/linux/dccp.h | 19 ------------- net/dccp/feat.c | 66 +++++++++++++++++++++++++++++++++++++------- net/dccp/feat.h | 2 +- 3 files changed, 57 insertions(+), 30 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 61734e27abb7..990e97fa1f07 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -369,28 +369,9 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) * Will be used to pass the state from dccp_request_sock to dccp_sock. * * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_pending - List of features being negotiated - * @dccpms_conf - */ struct dccp_minisock { __u64 dccpms_sequence_window; - struct list_head dccpms_pending; - struct list_head dccpms_conf; -}; - -struct dccp_opt_conf { - __u8 *dccpoc_val; - __u8 dccpoc_len; -}; - -struct dccp_opt_pend { - struct list_head dccpop_node; - __u8 dccpop_type; - __u8 dccpop_feat; - __u8 *dccpop_val; - __u8 dccpop_len; - int dccpop_conf; - struct dccp_opt_conf *dccpop_sc; }; extern void dccp_minisock_init(struct dccp_minisock *dmsk); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 35a57ab3bb1e..a687740e4420 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1110,24 +1110,70 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, return 0; /* ignore FN options in all other states */ } +/** + * dccp_feat_init - Seed feature negotiation with host-specific defaults + * This initialises global defaults, depending on the value of the sysctls. + * These can later be overridden by registering changes via setsockopt calls. + * The last link in the chain is finalise_settings, to make sure that between + * here and the start of actual feature negotiation no inconsistencies enter. + * + * All features not appearing below use either defaults or are otherwise + * later adjusted through dccp_feat_finalise_settings(). + */ int dccp_feat_init(struct sock *sk) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); + struct list_head *fn = &dccp_sk(sk)->dccps_featneg; + u8 on = 1, off = 0; int rc; + struct { + u8 *val; + u8 len; + } tx, rx; - INIT_LIST_HEAD(&dmsk->dccpms_pending); /* XXX no longer used */ - INIT_LIST_HEAD(&dmsk->dccpms_conf); /* XXX no longer used */ + /* Non-negotiable (NN) features */ + rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, + sysctl_dccp_feat_sequence_window); + if (rc) + return rc; - /* Ack ratio */ - rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, - dp->dccps_l_ack_ratio); -out: + /* Server-priority (SP) features */ + + /* Advertise that short seqnos are not supported (7.6.1) */ + rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1); + if (rc) + return rc; + + /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */ + rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1); + if (rc) + return rc; + + /* + * We advertise the available list of CCIDs and reorder according to + * preferences, to avoid failure resulting from negotiating different + * singleton values (which always leads to failure). + * These settings can still (later) be overridden via sockopts. + */ + if (ccid_get_builtin_ccids(&tx.val, &tx.len) || + ccid_get_builtin_ccids(&rx.val, &rx.len)) + return -ENOBUFS; + + if (!dccp_feat_prefer(sysctl_dccp_feat_tx_ccid, tx.val, tx.len) || + !dccp_feat_prefer(sysctl_dccp_feat_rx_ccid, rx.val, rx.len)) + goto free_ccid_lists; + + rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); + if (rc) + goto free_ccid_lists; + + rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len); + +free_ccid_lists: + kfree(tx.val); + kfree(rx.val); return rc; } -EXPORT_SYMBOL_GPL(dccp_feat_init); - int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) { struct dccp_sock *dp = dccp_sk(sk); diff --git a/net/dccp/feat.h b/net/dccp/feat.h index 177e9c39ea9f..f73b47abca93 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -112,13 +112,13 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) #define dccp_feat_debug(type, feat, val) #endif /* CONFIG_IP_DCCP_DEBUG */ +extern int dccp_feat_init(struct sock *sk); extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, u8 mand, u8 opt, u8 feat, u8 *val, u8 len); extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); -extern int dccp_feat_init(struct sock *sk); /* * Encoding variable-length options and their maximum length. From 09856c108956c99088ead9267ccbd1dab77f7043 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 040/105] dccp: Auto-load (when supported) CCID plugins for negotiation This adds auto-loading of CCIDs (when module loading is enabled) for the purpose of feature negotiation. The problem with loading the CCIDs at the end of feature negotiation is that this would happen in software interrupt context. Besides, if the host advertises CCIDs during negotiation, it should have them ready to use, in case an agreeing peer wants to use it for the connection. Signed-off-by: Gerrit Renker Signed-off-by: Ian McDonald --- net/dccp/ccid.c | 39 +++++++++++++++++++++++++++++---------- net/dccp/ccid.h | 1 + net/dccp/feat.c | 5 +++++ 3 files changed, 35 insertions(+), 10 deletions(-) diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 330372a1a0b6..e3fb52b4f5c6 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -196,22 +196,41 @@ int ccid_unregister(struct ccid_operations *ccid_ops) EXPORT_SYMBOL_GPL(ccid_unregister); +/** + * ccid_request_module - Pre-load CCID module for later use + * This should be called only from process context (e.g. during connection + * setup) and is necessary for later calls to ccid_new (typically in software + * interrupt), so that it has the modules available when they are needed. + */ +static int ccid_request_module(u8 id) +{ + if (!in_atomic()) { + ccids_read_lock(); + if (ccids[id] == NULL) { + ccids_read_unlock(); + return request_module("net-dccp-ccid-%d", id); + } + ccids_read_unlock(); + } + return 0; +} + +int ccid_request_modules(u8 const *ccid_array, u8 array_len) +{ +#ifdef CONFIG_KMOD + while (array_len--) + if (ccid_request_module(ccid_array[array_len])) + return -1; +#endif + return 0; +} + struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) { struct ccid_operations *ccid_ops; struct ccid *ccid = NULL; ccids_read_lock(); -#ifdef CONFIG_KMOD - if (ccids[id] == NULL) { - /* We only try to load if in process context */ - ccids_read_unlock(); - if (gfp & GFP_ATOMIC) - goto out; - request_module("net-dccp-ccid-%d", id); - ccids_read_lock(); - } -#endif ccid_ops = ccids[id]; if (ccid_ops == NULL) goto out_unlock; diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 18f69423a708..20ba066b2775 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -108,6 +108,7 @@ extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, char __user *, int __user *); +extern int ccid_request_modules(u8 const *ccid_array, u8 array_len); extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index a687740e4420..9a4938092783 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1158,6 +1158,11 @@ int dccp_feat_init(struct sock *sk) ccid_get_builtin_ccids(&rx.val, &rx.len)) return -ENOBUFS; + /* Pre-load all CCID modules that are going to be advertised */ + rc = -EUNATCH; + if (ccid_request_modules(tx.val, tx.len)) + goto free_ccid_lists; + if (!dccp_feat_prefer(sysctl_dccp_feat_tx_ccid, tx.val, tx.len) || !dccp_feat_prefer(sysctl_dccp_feat_rx_ccid, rx.val, rx.len)) goto free_ccid_lists; From 51c7d4fa2675c106a980ddcdbe308b54b5151945 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 041/105] dccp: Implement both feature-local and feature-remote Sequence Window feature This adds full support for local/remote Sequence Window feature, from which the * sequence-number-validity (W) and * acknowledgment-number-validity (W') windows derive as specified in RFC 4340, 7.5.3. Specifically, the following changes are introduced: * integrated new socket fields into dccp_sk; * updated the update_gsr/gss routines with regard to these fields; * updated handler code: the Sequence Window feature is located at the TX side, so the local feature is meant if the handler-rx flag is false; * the initialisation of `rcv_wnd' in reqsk is removed, since - rcv_wnd is not used by the code anywhere; - sequence number checks are not done in the LISTEN state (cf. 7.5.3); - dccp_check_req checks the Ack number validity more rigorously; * the `struct dccp_minisock' became empty and is now removed. Until the handshake completes with activating negotiated values, the local/remote Sequence-Window values are undefined and thus can not reliably be estimated. This issue is addressed in a separate patch. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- Documentation/networking/dccp.txt | 3 ++- include/linux/dccp.h | 24 ++++-------------------- net/dccp/dccp.h | 16 +++++++--------- net/dccp/feat.c | 13 +++++++++++-- net/dccp/minisocks.c | 11 ----------- net/dccp/proto.c | 2 -- 6 files changed, 24 insertions(+), 45 deletions(-) diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 7a3bb1abb830..b132e4a3cf0f 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -141,7 +141,8 @@ rx_ccid = 2 Default CCID for the receiver-sender half-connection; see tx_ccid. seq_window = 100 - The initial sequence window (sec. 7.5.2). + The initial sequence window (sec. 7.5.2) of the sender. This influences + the local ackno validity and the remote seqno validity windows (7.5.1). tx_qlen = 5 The size of the transmit buffer in packets. A value of 0 corresponds diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 990e97fa1f07..7a0502ab383a 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -363,19 +363,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) /* FIXME: for now we're default to 1 but it should really be 0 */ #define DCCPF_INITIAL_SEND_NDP_COUNT 1 -/** - * struct dccp_minisock - Minimal DCCP connection representation - * - * Will be used to pass the state from dccp_request_sock to dccp_sock. - * - * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - */ -struct dccp_minisock { - __u64 dccpms_sequence_window; -}; - -extern void dccp_minisock_init(struct dccp_minisock *dmsk); - /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from @@ -464,13 +451,14 @@ struct dccp_ackvec; * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo * @dccps_l_ack_ratio - feature-local Ack Ratio * @dccps_r_ack_ratio - feature-remote Ack Ratio + * @dccps_l_seq_win - local Sequence Window (influences ack number validity) + * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_minisock - associated minisock (accessed via dccp_msk) * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) @@ -504,12 +492,13 @@ struct dccp_sock { __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; + __u64 dccps_l_seq_win:48; + __u64 dccps_r_seq_win:48; __u8 dccps_pcslen:4; __u8 dccps_pcrlen:4; __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; - struct dccp_minisock dccps_minisock; struct list_head dccps_featneg; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; @@ -527,11 +516,6 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk) return (struct dccp_sock *)sk; } -static inline struct dccp_minisock *dccp_msk(const struct sock *sk) -{ - return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; -} - static inline const char *dccp_role(const struct sock *sk) { switch (dccp_sk(sk)->dccps_role) { diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 3fd16e82c003..6101ecdb85b6 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -409,23 +409,21 @@ static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack, static inline void dccp_update_gsr(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - const struct dccp_minisock *dmsk = dccp_msk(sk); dp->dccps_gsr = seq; - dccp_set_seqno(&dp->dccps_swl, - dp->dccps_gsr + 1 - (dmsk->dccpms_sequence_window / 4)); - dccp_set_seqno(&dp->dccps_swh, - dp->dccps_gsr + (3 * dmsk->dccpms_sequence_window) / 4); + /* Sequence validity window depends on remote Sequence Window (7.5.1) */ + dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); + dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); } static inline void dccp_update_gss(struct sock *sk, u64 seq) { struct dccp_sock *dp = dccp_sk(sk); - dp->dccps_awh = dp->dccps_gss = seq; - dccp_set_seqno(&dp->dccps_awl, - (dp->dccps_gss - - dccp_msk(sk)->dccpms_sequence_window + 1)); + dp->dccps_gss = seq; + /* Ack validity window depends on local Sequence Window value (7.5.1) */ + dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); + dp->dccps_awh = dp->dccps_gss; } static inline int dccp_ack_pending(const struct sock *sk) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 9a4938092783..843465999854 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -52,8 +52,17 @@ static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) { - if (!rx) - dccp_msk(sk)->dccpms_sequence_window = seq_win; + struct dccp_sock *dp = dccp_sk(sk); + + if (rx) { + dp->dccps_r_seq_win = seq_win; + /* propagate changes to update SWL/SWH */ + dccp_update_gsr(sk, dp->dccps_gsr); + } else { + dp->dccps_l_seq_win = seq_win; + /* propagate changes to update AWL */ + dccp_update_gss(sk, dp->dccps_gss); + } return 0; } diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 0ebf8ebcf3de..0ecb19c5e8ce 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -42,11 +42,6 @@ struct inet_timewait_death_row dccp_death_row = { EXPORT_SYMBOL_GPL(dccp_death_row); -void dccp_minisock_init(struct dccp_minisock *dmsk) -{ - dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; -} - void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -110,7 +105,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct dccp_request_sock *dreq = dccp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct dccp_sock *newdp = dccp_sk(newsk); - struct dccp_minisock *newdmsk = dccp_msk(newsk); newdp->dccps_role = DCCP_ROLE_SERVER; newdp->dccps_hc_rx_ackvec = NULL; @@ -128,10 +122,6 @@ struct sock *dccp_create_openreq_child(struct sock *sk, * Initialize S.GAR := S.ISS * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies */ - - /* See dccp_v4_conn_request */ - newdmsk->dccpms_sequence_window = req->rcv_wnd; - newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; dccp_update_gss(newsk, dreq->dreq_iss); @@ -289,7 +279,6 @@ int dccp_reqsk_init(struct request_sock *req, inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport; inet_rsk(req)->acked = 0; - req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; /* inherit feature negotiation options from listening socket */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 775eaa3d0c49..392a5d822b33 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -180,8 +180,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) struct dccp_sock *dp = dccp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - dccp_minisock_init(&dp->dccps_minisock); - icsk->icsk_rto = DCCP_TIMEOUT_INIT; icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; From 0a4822679d94e2b0117aeead06a19fad59533905 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 042/105] dccp: Initialisation and type-checking of feature sysctls This patch takes care of initialising and type-checking sysctls related to feature negotiation. Type checking is important since some of the sysctls now directly act on the feature-negotiation process. The sysctls are initialised with the known default values for each feature. For the type-checking the value constraints from RFC 4340 are used: * Sequence Window uses the specified Wmin=32, the maximum is ulong (4 bytes), tested and confirmed that it works up to 4294967295 - for Gbps speed; * Ack Ratio is between 0 .. 0xffff (2-byte unsigned integer); * CCIDs are between 0 .. 255; * request_retries, retries1, retries2 also between 0..255 for good measure; * tx_qlen is checked to be non-negative; * sync_ratelimit remains as before. Further changes: ---------------- Performed s@sysctl_dccp_feat@sysctl_dccp@g since the sysctls are now in feat.c. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- include/linux/dccp.h | 8 -------- net/dccp/dccp.h | 3 --- net/dccp/feat.c | 11 ++++++++--- net/dccp/feat.h | 8 ++++++++ net/dccp/options.c | 4 ---- net/dccp/sysctl.c | 43 ++++++++++++++++++++++++++++++------------- 6 files changed, 46 insertions(+), 31 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 7a0502ab383a..7434a8353e23 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -355,14 +355,6 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) return __dccp_hdr_len(dccp_hdr(skb)); } - -/* initial values for each feature */ -#define DCCPF_INITIAL_SEQUENCE_WINDOW 100 -#define DCCPF_INITIAL_ACK_RATIO 2 -#define DCCPF_INITIAL_CCID DCCPC_CCID2 -/* FIXME: for now we're default to 1 but it should really be 0 */ -#define DCCPF_INITIAL_SEND_NDP_COUNT 1 - /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 6101ecdb85b6..4f681f1a16cc 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -95,9 +95,6 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); extern int sysctl_dccp_request_retries; extern int sysctl_dccp_retries1; extern int sysctl_dccp_retries2; -extern int sysctl_dccp_feat_sequence_window; -extern int sysctl_dccp_feat_rx_ccid; -extern int sysctl_dccp_feat_tx_ccid; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 843465999854..4c95cbdb0e01 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -26,6 +26,11 @@ #include "ccid.h" #include "feat.h" +/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */ +unsigned long sysctl_dccp_sequence_window __read_mostly = 100; +int sysctl_dccp_rx_ccid __read_mostly = 2, + sysctl_dccp_tx_ccid __read_mostly = 2; + /* * Feature activation handlers. * @@ -1141,7 +1146,7 @@ int dccp_feat_init(struct sock *sk) /* Non-negotiable (NN) features */ rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0, - sysctl_dccp_feat_sequence_window); + sysctl_dccp_sequence_window); if (rc) return rc; @@ -1172,8 +1177,8 @@ int dccp_feat_init(struct sock *sk) if (ccid_request_modules(tx.val, tx.len)) goto free_ccid_lists; - if (!dccp_feat_prefer(sysctl_dccp_feat_tx_ccid, tx.val, tx.len) || - !dccp_feat_prefer(sysctl_dccp_feat_rx_ccid, rx.val, rx.len)) + if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) || + !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len)) goto free_ccid_lists; rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len); diff --git a/net/dccp/feat.h b/net/dccp/feat.h index f73b47abca93..f8456bca4eeb 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -99,6 +99,13 @@ struct ccid_dependency { u8 val; }; +/* + * Sysctls to seed defaults for feature negotiation + */ +extern unsigned long sysctl_dccp_sequence_window; +extern int sysctl_dccp_rx_ccid; +extern int sysctl_dccp_tx_ccid; + #ifdef CONFIG_IP_DCCP_DEBUG extern const char *dccp_feat_typename(const u8 type); extern const char *dccp_feat_name(const u8 feat); @@ -113,6 +120,7 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) #endif /* CONFIG_IP_DCCP_DEBUG */ extern int dccp_feat_init(struct sock *sk); +extern void dccp_feat_initialise_sysctls(void); extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); diff --git a/net/dccp/options.c b/net/dccp/options.c index aca309e16632..5906e96eedde 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -23,10 +23,6 @@ #include "dccp.h" #include "feat.h" -int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; -int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; - u64 dccp_decode_value_var(const u8 *bf, const u8 len) { u64 value = 0; diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 018e210875e1..a5a1856234e7 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -18,55 +18,72 @@ #error This file should not be compiled without CONFIG_SYSCTL defined #endif +/* Boundary values */ +static int zero = 0, + u8_max = 0xFF; +static unsigned long seqw_min = 32; + static struct ctl_table dccp_default_table[] = { { .procname = "seq_window", - .data = &sysctl_dccp_feat_sequence_window, - .maxlen = sizeof(sysctl_dccp_feat_sequence_window), + .data = &sysctl_dccp_sequence_window, + .maxlen = sizeof(sysctl_dccp_sequence_window), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */ }, { .procname = "rx_ccid", - .data = &sysctl_dccp_feat_rx_ccid, - .maxlen = sizeof(sysctl_dccp_feat_rx_ccid), + .data = &sysctl_dccp_rx_ccid, + .maxlen = sizeof(sysctl_dccp_rx_ccid), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, /* RFC 4340, 10. */ }, { .procname = "tx_ccid", - .data = &sysctl_dccp_feat_tx_ccid, - .maxlen = sizeof(sysctl_dccp_feat_tx_ccid), + .data = &sysctl_dccp_tx_ccid, + .maxlen = sizeof(sysctl_dccp_tx_ccid), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, /* RFC 4340, 10. */ }, { .procname = "request_retries", .data = &sysctl_dccp_request_retries, .maxlen = sizeof(sysctl_dccp_request_retries), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "retries1", .data = &sysctl_dccp_retries1, .maxlen = sizeof(sysctl_dccp_retries1), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "retries2", .data = &sysctl_dccp_retries2, .maxlen = sizeof(sysctl_dccp_retries2), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &u8_max, }, { .procname = "tx_qlen", .data = &sysctl_dccp_tx_qlen, .maxlen = sizeof(sysctl_dccp_tx_qlen), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, }, { .procname = "sync_ratelimit", From 76f738a7950b559a23ab3c692c99a02f35a54f7f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 043/105] dccp: Debugging functions for feature negotiation Since all feature-negotiation processing now takes place in feat.c, functions for producing verbose debugging output are concentrated there. New functions to print out values, entry records, and options are provided, and also a macro is defined to not always have the function name in the output line. Thanks a lot to Wei Yongjun and Giuseppe Galeota for help with errors in an earlier revision of this patch. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/dccp.h | 2 + net/dccp/feat.c | 153 +++++++++++++++++++++++++++++++-------------- net/dccp/feat.h | 13 ---- net/dccp/options.c | 4 -- 4 files changed, 109 insertions(+), 63 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 4f681f1a16cc..94ae6d41d724 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -42,9 +42,11 @@ extern int dccp_debug; #define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a) #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) +#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) #else #define dccp_pr_debug(format, a...) #define dccp_pr_debug_cat(format, a...) +#define dccp_debug(format, a...) #endif extern struct inet_hashinfo dccp_hashinfo; diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 4c95cbdb0e01..3abacad0b224 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -204,6 +204,100 @@ static int dccp_feat_default_value(u8 feat_num) return idx < 0 ? : dccp_feat_table[idx].default_value; } +/* + * Debugging and verbose-printing section + */ +static const char *dccp_feat_fname(const u8 feat) +{ + static const char *feature_names[] = { + [DCCPF_RESERVED] = "Reserved", + [DCCPF_CCID] = "CCID", + [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", + [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", + [DCCPF_ECN_INCAPABLE] = "ECN Incapable", + [DCCPF_ACK_RATIO] = "Ack Ratio", + [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", + [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", + [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", + [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", + }; + if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) + return feature_names[DCCPF_RESERVED]; + + if (feat == DCCPF_SEND_LEV_RATE) + return "Send Loss Event Rate"; + if (feat >= DCCPF_MIN_CCID_SPECIFIC) + return "CCID-specific"; + + return feature_names[feat]; +} + +static const char *dccp_feat_sname[] = { "DEFAULT", "INITIALISING", "CHANGING", + "UNSTABLE", "STABLE" }; + +#ifdef CONFIG_IP_DCCP_DEBUG +static const char *dccp_feat_oname(const u8 opt) +{ + switch (opt) { + case DCCPO_CHANGE_L: return "Change_L"; + case DCCPO_CONFIRM_L: return "Confirm_L"; + case DCCPO_CHANGE_R: return "Change_R"; + case DCCPO_CONFIRM_R: return "Confirm_R"; + } + return NULL; +} + +static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val) +{ + u8 i, type = dccp_feat_type(feat_num); + + if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL)) + dccp_pr_debug_cat("(NULL)"); + else if (type == FEAT_SP) + for (i = 0; i < val->sp.len; i++) + dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]); + else if (type == FEAT_NN) + dccp_pr_debug_cat("%llu", (unsigned long long)val->nn); + else + dccp_pr_debug_cat("unknown type %u", type); +} + +static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len) +{ + u8 type = dccp_feat_type(feat_num); + dccp_feat_val fval = { .sp.vec = list, .sp.len = len }; + + if (type == FEAT_NN) + fval.nn = dccp_decode_value_var(list, len); + dccp_feat_printval(feat_num, &fval); +} + +static void dccp_feat_print_entry(struct dccp_feat_entry const *entry) +{ + dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote", + dccp_feat_fname(entry->feat_num)); + dccp_feat_printval(entry->feat_num, &entry->val); + dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state], + entry->needs_confirm ? "(Confirm pending)" : ""); +} + +#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \ + dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\ + dccp_feat_printvals(feat, val, len); \ + dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0) + +#define dccp_feat_print_fnlist(fn_list) { \ + const struct dccp_feat_entry *___entry; \ + \ + dccp_pr_debug("List Dump:\n"); \ + list_for_each_entry(___entry, fn_list, node) \ + dccp_feat_print_entry(___entry); \ +} +#else /* ! CONFIG_IP_DCCP_DEBUG */ +#define dccp_feat_print_opt(opt, feat, val, len, mandatory) +#define dccp_feat_print_fnlist(fn_list) +#endif + static int __dccp_feat_activate(struct sock *sk, const int idx, const bool is_local, dccp_feat_val const *fval) { @@ -236,6 +330,10 @@ static int __dccp_feat_activate(struct sock *sk, const int idx, /* Location is RX if this is a local-RX or remote-TX feature */ rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); + dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX", + dccp_feat_fname(dccp_feat_table[idx].feat_num), + fval ? "" : "default ", (unsigned long long)val); + return dccp_feat_table[idx].activation_hdlr(sk, val, rx); } @@ -539,6 +637,7 @@ int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, return -1; } } + dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0); if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) return -1; @@ -792,6 +891,7 @@ int dccp_feat_finalise_settings(struct dccp_sock *dp) while (i--) if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) return -1; + dccp_feat_print_fnlist(fn); return 0; } @@ -910,6 +1010,8 @@ static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ goto unknown_feature_or_value; + dccp_feat_print_opt(opt, feat, val, len, is_mandatory); + /* * Negotiation of NN features: Change R is invalid, so there is no * simultaneous negotiation; hence we do not look up in the list. @@ -1015,6 +1117,8 @@ static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, const bool local = (opt == DCCPO_CONFIRM_R); struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); + dccp_feat_print_opt(opt, feat, val, len, is_mandatory); + if (entry == NULL) { /* nothing queued: ignore or handle error */ if (is_mandatory && type == FEAT_UNKNOWN) return DCCP_RESET_CODE_MANDATORY_ERROR; @@ -1217,9 +1321,10 @@ int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) goto activation_failed; } if (cur->state != FEAT_STABLE) { - DCCP_CRIT("Negotiation of %s %u failed in state %u", + DCCP_CRIT("Negotiation of %s %s failed in state %s", cur->is_local ? "local" : "remote", - cur->feat_num, cur->state); + dccp_feat_fname(cur->feat_num), + dccp_feat_sname[cur->state]); goto activation_failed; } fvals[idx][cur->is_local] = &cur->val; @@ -1260,47 +1365,3 @@ activation_failed: dp->dccps_hc_rx_ackvec = NULL; return -1; } - -#ifdef CONFIG_IP_DCCP_DEBUG -const char *dccp_feat_typename(const u8 type) -{ - switch(type) { - case DCCPO_CHANGE_L: return("ChangeL"); - case DCCPO_CONFIRM_L: return("ConfirmL"); - case DCCPO_CHANGE_R: return("ChangeR"); - case DCCPO_CONFIRM_R: return("ConfirmR"); - /* the following case must not appear in feature negotation */ - default: dccp_pr_debug("unknown type %d [BUG!]\n", type); - } - return NULL; -} - -EXPORT_SYMBOL_GPL(dccp_feat_typename); - -const char *dccp_feat_name(const u8 feat) -{ - static const char *feature_names[] = { - [DCCPF_RESERVED] = "Reserved", - [DCCPF_CCID] = "CCID", - [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos", - [DCCPF_SEQUENCE_WINDOW] = "Sequence Window", - [DCCPF_ECN_INCAPABLE] = "ECN Incapable", - [DCCPF_ACK_RATIO] = "Ack Ratio", - [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector", - [DCCPF_SEND_NDP_COUNT] = "Send NDP Count", - [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage", - [DCCPF_DATA_CHECKSUM] = "Send Data Checksum", - }; - if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) - return feature_names[DCCPF_RESERVED]; - - if (feat == DCCPF_SEND_LEV_RATE) - return "Send Loss Event Rate"; - if (feat >= DCCPF_MIN_CCID_SPECIFIC) - return "CCID-specific"; - - return feature_names[feat]; -} - -EXPORT_SYMBOL_GPL(dccp_feat_name); -#endif /* CONFIG_IP_DCCP_DEBUG */ diff --git a/net/dccp/feat.h b/net/dccp/feat.h index f8456bca4eeb..2217066e22d7 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -106,19 +106,6 @@ extern unsigned long sysctl_dccp_sequence_window; extern int sysctl_dccp_rx_ccid; extern int sysctl_dccp_tx_ccid; -#ifdef CONFIG_IP_DCCP_DEBUG -extern const char *dccp_feat_typename(const u8 type); -extern const char *dccp_feat_name(const u8 feat); - -static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) -{ - dccp_pr_debug("%s(%s (%d), %d)\n", dccp_feat_typename(type), - dccp_feat_name(feat), feat, val); -} -#else -#define dccp_feat_debug(type, feat, val) -#endif /* CONFIG_IP_DCCP_DEBUG */ - extern int dccp_feat_init(struct sock *sk); extern void dccp_feat_initialise_sysctls(void); extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, diff --git a/net/dccp/options.c b/net/dccp/options.c index 5906e96eedde..fd51cc70c63e 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -498,10 +498,6 @@ int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, *to++ = *val; if (len) memcpy(to, val, len); - - dccp_pr_debug("%s(%s (%d), ...), length %d\n", - dccp_feat_typename(type), - dccp_feat_name(feat), feat, len); return 0; } From 624a965a93610152b10c73d050ed44812efa8abe Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 044/105] dccp: Support for the exchange of NN options in established state In contrast to static feature negotiation at the begin of a connection, which establishes the capabilities of both endpoints, this patch introduces support for dynamic exchange of feature negotiation options. Such a dynamic exchange is necessary in at least two cases: * CCID-2's Ack Ratio (RFC 4341, 6.1.2) which changes during the connection; * Sequence Window values that, as per RFC 4340, 7.5.2, should be sent "as as the connection progresses". Both are NN (non-negotiable) features. Hence dynamic feature "negotiation" is distinguished from static/pre-connection negotiation by the following: * no new capabilities are negotiated (those that matter for the connection are negotiated prior to setting up the connection, comparable to SIP); * features must be understood by each endpoint: as per RFC 4340, 6.4, Sequence Window is "Req'd" and Ack Ratio must be understood when CCID-2 is used as per the note underneath Table 4. These characteristics are reflected in the implementation: * only NN options can be exchanged after connection setup; * NN options are activated directly after validating them. The rationale is that a peer must accept every valid NN value (RFC 4340, 6.3.2), hence it will either accept the value and send a "Confirm R", or it will send an empty Confirm (which will reset the connection according to FN rules). * An Ack is scheduled directly after activation to accelerate communicating the update to the peer. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/dccp.h | 1 + net/dccp/feat.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 94ae6d41d724..0a07b2e24e62 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -436,6 +436,7 @@ static inline int dccp_ack_pending(const struct sock *sk) inet_csk_ack_scheduled(sk); } +extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); extern int dccp_feat_finalise_settings(struct dccp_sock *dp); extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 3abacad0b224..5be8b85ac74b 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -13,6 +13,7 @@ * ----------- * o Feature negotiation is coordinated with connection setup (as in TCP), wild * changes of parameters of an established connection are not supported. + * o Changing NN values (Ack Ratio only) is supported in state OPEN/PARTOPEN. * o All currently known SP features have 1-byte quantities. If in the future * extensions of RFCs 4340..42 define features with item lengths larger than * one byte, a feature-specific extension of the code will be required. @@ -337,6 +338,20 @@ static int __dccp_feat_activate(struct sock *sk, const int idx, return dccp_feat_table[idx].activation_hdlr(sk, val, rx); } +/** + * dccp_feat_activate - Activate feature value on socket + * @sk: fully connected DCCP socket (after handshake is complete) + * @feat_num: feature to activate, one of %dccp_feature_numbers + * @local: whether local (1) or remote (0) @feat_num is meant + * @fval: the value (SP or NN) to activate, or NULL to use the default value + * For general use this function is preferable over __dccp_feat_activate(). + */ +static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local, + dccp_feat_val const *fval) +{ + return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval); +} + /* Test for "Req'd" feature (RFC 4340, 6.4) */ static inline int dccp_feat_must_be_understood(u8 feat_num) { @@ -734,6 +749,48 @@ int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); } +/** + * dccp_feat_signal_nn_change - Update NN values for an established connection + * @sk: DCCP socket of an established connection + * @feat: NN feature number from %dccp_feature_numbers + * @nn_val: the new value to use + * This function is used to communicate NN updates out-of-band. The difference + * to feature negotiation during connection setup is that values are activated + * immediately after validation, i.e. we don't wait for the Confirm: either the + * value is accepted by the peer (and then the waiting is futile), or it is not + * (Reset or empty Confirm). We don't accept empty Confirms - transmitted values + * are validated, and the peer "MUST accept any valid value" (RFC 4340, 6.3.2). + */ +int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val) +{ + struct list_head *fn = &dccp_sk(sk)->dccps_featneg; + dccp_feat_val fval = { .nn = nn_val }; + struct dccp_feat_entry *entry; + + if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN) + return 0; + + if (dccp_feat_type(feat) != FEAT_NN || + !dccp_feat_is_valid_nn_val(feat, nn_val)) + return -EINVAL; + + entry = dccp_feat_list_lookup(fn, feat, 1); + if (entry != NULL) { + dccp_pr_debug("Ignoring %llu, entry %llu exists in state %s\n", + (unsigned long long)nn_val, + (unsigned long long)entry->val.nn, + dccp_feat_sname[entry->state]); + return 0; + } + + if (dccp_feat_activate(sk, feat, 1, &fval)) + return -EADV; + + inet_csk_schedule_ack(sk); + return dccp_feat_push_change(fn, feat, 1, 0, &fval); +} +EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change); + /* * Tracking features whose value depend on the choice of CCID * From 4861a354430d2ea36847ef88086c7449b4f385b6 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 045/105] dccp: Support for exchanging of NN options in established state This patch provides support for the reception of NN options in (PART)OPEN state. It is a combination of change_recv() and confirm_recv(), specifically geared towards receiving the `fast-path' NN options. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/feat.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 5be8b85ac74b..c847c80d1b97 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1246,6 +1246,93 @@ confirmation_failed: : DCCP_RESET_CODE_OPTION_ERROR; } +/** + * dccp_feat_handle_nn_established - Fast-path reception of NN options + * @sk: socket of an established DCCP connection + * @mandatory: whether @opt was preceded by a Mandatory option + * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only) + * @feat: NN number, one of %dccp_feature_numbers + * @val: NN value + * @len: length of @val in bytes + * This function combines the functionality of change_recv/confirm_recv, with + * the following differences (reset codes are the same): + * - cleanup after receiving the Confirm; + * - values are directly activated after successful parsing; + * - deliberately restricted to NN features. + * The restriction to NN features is essential since SP features can have non- + * predictable outcomes (depending on the remote configuration), and are inter- + * dependent (CCIDs for instance cause further dependencies). + */ +static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt, + u8 feat, u8 *val, u8 len) +{ + struct list_head *fn = &dccp_sk(sk)->dccps_featneg; + const bool local = (opt == DCCPO_CONFIRM_R); + struct dccp_feat_entry *entry; + u8 type = dccp_feat_type(feat); + dccp_feat_val fval; + + dccp_feat_print_opt(opt, feat, val, len, mandatory); + + /* Ignore non-mandatory unknown and non-NN features */ + if (type == FEAT_UNKNOWN) { + if (local && !mandatory) + return 0; + goto fast_path_unknown; + } else if (type != FEAT_NN) { + return 0; + } + + /* + * We don't accept empty Confirms, since in fast-path feature + * negotiation the values are enabled immediately after sending + * the Change option. + * Empty Changes on the other hand are invalid (RFC 4340, 6.1). + */ + if (len == 0 || len > sizeof(fval.nn)) + goto fast_path_unknown; + + if (opt == DCCPO_CHANGE_L) { + fval.nn = dccp_decode_value_var(val, len); + if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) + goto fast_path_unknown; + + if (dccp_feat_push_confirm(fn, feat, local, &fval) || + dccp_feat_activate(sk, feat, local, &fval)) + return DCCP_RESET_CODE_TOO_BUSY; + + /* set the `Ack Pending' flag to piggyback a Confirm */ + inet_csk_schedule_ack(sk); + + } else if (opt == DCCPO_CONFIRM_R) { + entry = dccp_feat_list_lookup(fn, feat, local); + if (entry == NULL || entry->state != FEAT_CHANGING) + return 0; + + fval.nn = dccp_decode_value_var(val, len); + if (fval.nn != entry->val.nn) { + DCCP_WARN("Bogus Confirm for non-existing value\n"); + goto fast_path_failed; + } + + /* It has been confirmed - so remove the entry */ + dccp_feat_list_pop(entry); + + } else { + DCCP_WARN("Received illegal option %u\n", opt); + goto fast_path_failed; + } + return 0; + +fast_path_unknown: + if (!mandatory) + return dccp_push_empty_confirm(fn, feat, local); + +fast_path_failed: + return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR + : DCCP_RESET_CODE_OPTION_ERROR; +} + /** * dccp_feat_parse_options - Process Feature-Negotiation Options * @sk: for general use and used by the client during connection setup @@ -1281,6 +1368,15 @@ int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, return dccp_feat_confirm_recv(fn, mandatory, opt, feat, val, len, server); } + break; + /* + * Support for exchanging NN options on an established connection + * This is currently restricted to Ack Ratio (RFC 4341, 6.1.2) + */ + case DCCP_OPEN: + case DCCP_PARTOPEN: + return dccp_feat_handle_nn_established(sk, mandatory, opt, feat, + val, len); } return 0; /* ignore FN options in all other states */ } From 2faae5587f692fd5c79856ca4c4b90944ee0472a Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 046/105] dccp ccid-2: Use feature-negotiation to report Ack Ratio changes This uses the new feature-negotiation framework to signal Ack Ratio changes, as required by RFC 4341, sec. 6.1.2. This raises some problems for CCID-2 since it can at the moment not cope gracefully with Ack Ratio of e.g. 2. A FIXME has thus been added which reverts to the existing policy of bypassing the Ack Ratio sysctl. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/feat.c | 12 ++++++++++++ net/dccp/proto.c | 1 - 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/net/dccp/feat.c b/net/dccp/feat.c index c847c80d1b97..f94c7c9d1a7f 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -74,6 +74,18 @@ static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) { +#ifndef __CCID2_COPES_GRACEFULLY_WITH_DYNAMIC_ACK_RATIO_UPDATES__ + /* + * FIXME: This is required until several problems in the CCID-2 code are + * resolved. The CCID-2 code currently does not cope well; using dynamic + * Ack Ratios greater than 1 caused instabilities. These were manifest + * in hangups and long RTO timeouts (1...3 seconds). Until this has been + * stabilised, it is safer not to activate dynamic Ack Ratio changes. + */ + dccp_pr_debug("Not changing %s Ack Ratio from 1 to %u\n", + rx ? "RX" : "TX", (u16)ratio); + ratio = 1; +#endif if (rx) dccp_sk(sk)->dccps_r_ack_ratio = ratio; else diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 392a5d822b33..11905e0cf8f7 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -189,7 +189,6 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; - dp->dccps_l_ack_ratio = dp->dccps_r_ack_ratio = 1; dccp_init_xmit_timers(sk); From 55ebe3ab2d504bd3f3eeade0262826210019abda Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 047/105] dccp: Leave headroom for options when calculating the MPS The Maximum Packet Size (MPS) is of interest for applications which want to transfer data, so it is only relevant to the data transfer phase of a connection (unless one wants to send data on the DCCP-Request, but that is not considered here). The strategy chosen to deal with this requirement is to leave room for only such options that may appear on data packets. A special consideration applies to Ack Vectors: this is purely guesswork, since these can have any length between 3 and 1020 bytes. The strategy chosen here is to subtract a configurable minimum, the value of 16 bytes (2 bytes for type/length plus 14 Ack Vector cells) has been found by experimentatation. If people experience this as too much or too little, this could later be turned into a Kconfig option. There are currently no CCID-specific header options which may appear on data packets, hence it is not necessary to define a corresponding CCID field. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/ackvec.h | 3 +++ net/dccp/output.c | 22 ++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 4ccee030524e..1c10814fdf72 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -20,6 +20,9 @@ /* We can spread an ack vector across multiple options */ #define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2) +/* Estimated minimum average Ack Vector length - used for updating MPS */ +#define DCCPAV_MIN_OPTLEN 16 + #define DCCP_ACKVEC_STATE_RECEIVED 0 #define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) #define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) diff --git a/net/dccp/output.c b/net/dccp/output.c index 19a93d5433a7..0aab919dafe6 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -161,21 +161,27 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); u32 ccmps = dccp_determine_ccmps(dp); - int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; + u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; /* Account for header lengths and IPv4/v6 option overhead */ cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); /* - * FIXME: this should come from the CCID infrastructure, where, say, - * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets - * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED - * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to - * make it a multiple of 4 + * Leave enough headroom for common DCCP header options. + * This only considers options which may appear on DCCP-Data packets, as + * per table 3 in RFC 4340, 5.8. When running out of space for other + * options (eg. Ack Vector which can take up to 255 bytes), it is better + * to schedule a separate Ack. Thus we leave headroom for the following: + * - 1 byte for Slow Receiver (11.6) + * - 6 bytes for Timestamp (13.1) + * - 10 bytes for Timestamp Echo (13.3) + * - 8 bytes for NDP count (7.7, when activated) + * - 6 bytes for Data Checksum (9.3) + * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled) */ - - cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; + cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 + + (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; From 88ddac513a4e7e04234214b600401ec22abfbb46 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 048/105] dccp: Special case of the MPS for client-PARTOPEN with DataAcks To increase robustness, it is necessary to resend Confirm feature-negotiation options, even though the RFC does not mandate it. But feature negotiation options can take (much) more room than the options on common DataAck packets. Instead of reducing the MPS always for a case which only applies to the three messages send during initial handshake, this patch devises a special case: if the payload length of the DataAck in PARTOPEN is too large, an Ack is sent to carry the options, and the feature-negotiation list is then flushed. This means that the server gets two Acks for one Response. If both Acks get lost, it is probably better to restart the connection anyway and devising yet another special-case does not seem worth the extra complexity. The patch (over-)estimates the expected overhead to be 32*4 bytes -- commonly seen values were 20-90 bytes for initial feature-negotiation options. It uses sizeof(u32) to mean "aligned units of 4 bytes". For consistency, another use of sizeof is modified. Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 5 ++++- net/dccp/output.c | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 0a07b2e24e62..c7370de4c4d5 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -63,11 +63,14 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields * Hence a safe upper bound for the maximum option length is 1020-28 = 992 */ -#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) +#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t)) #define DCCP_MAX_PACKET_HDR 28 #define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) #define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) +/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */ +#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t)) + #define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ diff --git a/net/dccp/output.c b/net/dccp/output.c index 0aab919dafe6..9888f61f9b0b 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -276,7 +276,20 @@ void dccp_write_xmit(struct sock *sk, int block) const int len = skb->len; if (sk->sk_state == DCCP_PARTOPEN) { - /* See 8.1.5. Handshake Completion */ + const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; + /* + * See 8.1.5 - Handshake Completion. + * + * For robustness we resend Confirm options until the client has + * entered OPEN. During the initial feature negotiation, the MPS + * is smaller than usual, reduced by the Change/Confirm options. + */ + if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { + DCCP_WARN("Payload too large (%d) for featneg.\n", len); + dccp_send_ack(sk); + dccp_feat_list_purge(&dp->dccps_featneg); + } + inet_csk_schedule_ack(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, inet_csk(sk)->icsk_rto, From 1fb87509606cb19f5f603e54c28af7da149049f3 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 049/105] dccp ccid-2: Remove ccid2hc{tx,rx}_ prefixes This patch fixes two problems caused by the ubiquitous long "hctx->ccid2htx_" and "hcrx->ccid2hcrx_" prefixes: * code becomes hard to read; * multiple-line statements are almost inevitable even for simple expressions; The prefixes are not really necessary (compare with "struct tcp_sock"). There had been previous discussion of this on dccp@vger, but so far this was not followed up (most people agreed that the prefixes are too long). Signed-off-by: Gerrit Renker Signed-off-by: Leandro Melo de Sales --- net/dccp/ccids/ccid2.c | 274 ++++++++++++++++++++--------------------- net/dccp/ccids/ccid2.h | 48 ++++---- 2 files changed, 159 insertions(+), 163 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index c9ea19a4d85e..9728bbf0acea 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -39,16 +39,16 @@ static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) { int len = 0; int pipe = 0; - struct ccid2_seq *seqp = hctx->ccid2hctx_seqh; + struct ccid2_seq *seqp = hctx->seqh; /* there is data in the chain */ - if (seqp != hctx->ccid2hctx_seqt) { + if (seqp != hctx->seqt) { seqp = seqp->ccid2s_prev; len++; if (!seqp->ccid2s_acked) pipe++; - while (seqp != hctx->ccid2hctx_seqt) { + while (seqp != hctx->seqt) { struct ccid2_seq *prev = seqp->ccid2s_prev; len++; @@ -65,16 +65,16 @@ static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) } } - BUG_ON(pipe != hctx->ccid2hctx_pipe); + BUG_ON(pipe != hctx->pipe); ccid2_pr_debug("len of chain=%d\n", len); do { seqp = seqp->ccid2s_prev; len++; - } while (seqp != hctx->ccid2hctx_seqh); + } while (seqp != hctx->seqh); ccid2_pr_debug("total len=%d\n", len); - BUG_ON(len != hctx->ccid2hctx_seqbufc * CCID2_SEQBUF_LEN); + BUG_ON(len != hctx->seqbufc * CCID2_SEQBUF_LEN); } #else #define ccid2_pr_debug(format, a...) @@ -87,8 +87,7 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) int i; /* check if we have space to preserve the pointer to the buffer */ - if (hctx->ccid2hctx_seqbufc >= (sizeof(hctx->ccid2hctx_seqbuf) / - sizeof(struct ccid2_seq*))) + if (hctx->seqbufc >= sizeof(hctx->seqbuf) / sizeof(struct ccid2_seq *)) return -ENOMEM; /* allocate buffer and initialize linked list */ @@ -104,20 +103,20 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; /* This is the first allocation. Initiate the head and tail. */ - if (hctx->ccid2hctx_seqbufc == 0) - hctx->ccid2hctx_seqh = hctx->ccid2hctx_seqt = seqp; + if (hctx->seqbufc == 0) + hctx->seqh = hctx->seqt = seqp; else { /* link the existing list with the one we just created */ - hctx->ccid2hctx_seqh->ccid2s_next = seqp; - seqp->ccid2s_prev = hctx->ccid2hctx_seqh; + hctx->seqh->ccid2s_next = seqp; + seqp->ccid2s_prev = hctx->seqh; - hctx->ccid2hctx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; - seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->ccid2hctx_seqt; + hctx->seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1]; + seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hctx->seqt; } /* store the original pointer to the buffer so we can free it */ - hctx->ccid2hctx_seqbuf[hctx->ccid2hctx_seqbufc] = seqp; - hctx->ccid2hctx_seqbufc++; + hctx->seqbuf[hctx->seqbufc] = seqp; + hctx->seqbufc++; return 0; } @@ -126,7 +125,7 @@ static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (hctx->ccid2hctx_pipe < hctx->ccid2hctx_cwnd) + if (hctx->pipe < hctx->cwnd) return 0; return 1; /* XXX CCID should dequeue when ready instead of polling */ @@ -135,7 +134,7 @@ static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) { struct dccp_sock *dp = dccp_sk(sk); - u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->ccid2hctx_cwnd, 2); + u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->cwnd, 2); /* * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from @@ -160,7 +159,7 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) { ccid2_pr_debug("change SRTT to %ld\n", val); - hctx->ccid2hctx_srtt = val; + hctx->srtt = val; } static void ccid2_start_rto_timer(struct sock *sk); @@ -173,8 +172,7 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) bh_lock_sock(sk); if (sock_owned_by_user(sk)) { - sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, - jiffies + HZ / 5); + sk_reset_timer(sk, &hctx->rtotimer, jiffies + HZ / 5); goto out; } @@ -183,28 +181,28 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) ccid2_hc_tx_check_sanity(hctx); /* back-off timer */ - hctx->ccid2hctx_rto <<= 1; + hctx->rto <<= 1; - s = hctx->ccid2hctx_rto / HZ; + s = hctx->rto / HZ; if (s > 60) - hctx->ccid2hctx_rto = 60 * HZ; + hctx->rto = 60 * HZ; ccid2_start_rto_timer(sk); /* adjust pipe, cwnd etc */ - hctx->ccid2hctx_ssthresh = hctx->ccid2hctx_cwnd / 2; - if (hctx->ccid2hctx_ssthresh < 2) - hctx->ccid2hctx_ssthresh = 2; - hctx->ccid2hctx_cwnd = 1; - hctx->ccid2hctx_pipe = 0; + hctx->ssthresh = hctx->cwnd / 2; + if (hctx->ssthresh < 2) + hctx->ssthresh = 2; + hctx->cwnd = 1; + hctx->pipe = 0; /* clear state about stuff we sent */ - hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqh; - hctx->ccid2hctx_packets_acked = 0; + hctx->seqt = hctx->seqh; + hctx->packets_acked = 0; /* clear ack ratio state. */ - hctx->ccid2hctx_rpseq = 0; - hctx->ccid2hctx_rpdupack = -1; + hctx->rpseq = 0; + hctx->rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); ccid2_hc_tx_check_sanity(hctx); out: @@ -216,11 +214,11 @@ static void ccid2_start_rto_timer(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->ccid2hctx_rto); + ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->rto); - BUG_ON(timer_pending(&hctx->ccid2hctx_rtotimer)); - sk_reset_timer(sk, &hctx->ccid2hctx_rtotimer, - jiffies + hctx->ccid2hctx_rto); + BUG_ON(timer_pending(&hctx->rtotimer)); + sk_reset_timer(sk, &hctx->rtotimer, + jiffies + hctx->rto); } static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) @@ -229,27 +227,26 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); struct ccid2_seq *next; - hctx->ccid2hctx_pipe++; + hctx->pipe++; - hctx->ccid2hctx_seqh->ccid2s_seq = dp->dccps_gss; - hctx->ccid2hctx_seqh->ccid2s_acked = 0; - hctx->ccid2hctx_seqh->ccid2s_sent = jiffies; + hctx->seqh->ccid2s_seq = dp->dccps_gss; + hctx->seqh->ccid2s_acked = 0; + hctx->seqh->ccid2s_sent = jiffies; - next = hctx->ccid2hctx_seqh->ccid2s_next; + next = hctx->seqh->ccid2s_next; /* check if we need to alloc more space */ - if (next == hctx->ccid2hctx_seqt) { + if (next == hctx->seqt) { if (ccid2_hc_tx_alloc_seq(hctx)) { DCCP_CRIT("packet history - out of memory!"); /* FIXME: find a more graceful way to bail out */ return; } - next = hctx->ccid2hctx_seqh->ccid2s_next; - BUG_ON(next == hctx->ccid2hctx_seqt); + next = hctx->seqh->ccid2s_next; + BUG_ON(next == hctx->seqt); } - hctx->ccid2hctx_seqh = next; + hctx->seqh = next; - ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->ccid2hctx_cwnd, - hctx->ccid2hctx_pipe); + ccid2_pr_debug("cwnd=%d pipe=%d\n", hctx->cwnd, hctx->pipe); /* * FIXME: The code below is broken and the variables have been removed @@ -272,12 +269,12 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) */ #if 0 /* Ack Ratio. Need to maintain a concept of how many windows we sent */ - hctx->ccid2hctx_arsent++; + hctx->arsent++; /* We had an ack loss in this window... */ - if (hctx->ccid2hctx_ackloss) { - if (hctx->ccid2hctx_arsent >= hctx->ccid2hctx_cwnd) { - hctx->ccid2hctx_arsent = 0; - hctx->ccid2hctx_ackloss = 0; + if (hctx->ackloss) { + if (hctx->arsent >= hctx->cwnd) { + hctx->arsent = 0; + hctx->ackloss = 0; } } else { /* No acks lost up to now... */ @@ -287,28 +284,28 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio - dp->dccps_l_ack_ratio; - denom = hctx->ccid2hctx_cwnd * hctx->ccid2hctx_cwnd / denom; + denom = hctx->cwnd * hctx->cwnd / denom; - if (hctx->ccid2hctx_arsent >= denom) { + if (hctx->arsent >= denom) { ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1); - hctx->ccid2hctx_arsent = 0; + hctx->arsent = 0; } } else { /* we can't increase ack ratio further [1] */ - hctx->ccid2hctx_arsent = 0; /* or maybe set it to cwnd*/ + hctx->arsent = 0; /* or maybe set it to cwnd*/ } } #endif /* setup RTO timer */ - if (!timer_pending(&hctx->ccid2hctx_rtotimer)) + if (!timer_pending(&hctx->rtotimer)) ccid2_start_rto_timer(sk); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { - struct ccid2_seq *seqp = hctx->ccid2hctx_seqt; + struct ccid2_seq *seqp = hctx->seqt; - while (seqp != hctx->ccid2hctx_seqh) { + while (seqp != hctx->seqh) { ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", (unsigned long long)seqp->ccid2s_seq, seqp->ccid2s_acked, seqp->ccid2s_sent); @@ -386,7 +383,7 @@ static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - sk_stop_timer(sk, &hctx->ccid2hctx_rtotimer); + sk_stop_timer(sk, &hctx->rtotimer); ccid2_pr_debug("deleted RTO timer\n"); } @@ -396,73 +393,73 @@ static inline void ccid2_new_ack(struct sock *sk, { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) { - if (*maxincr > 0 && ++hctx->ccid2hctx_packets_acked == 2) { - hctx->ccid2hctx_cwnd += 1; - *maxincr -= 1; - hctx->ccid2hctx_packets_acked = 0; + if (hctx->cwnd < hctx->ssthresh) { + if (*maxincr > 0 && ++hctx->packets_acked == 2) { + hctx->cwnd += 1; + *maxincr -= 1; + hctx->packets_acked = 0; } - } else if (++hctx->ccid2hctx_packets_acked >= hctx->ccid2hctx_cwnd) { - hctx->ccid2hctx_cwnd += 1; - hctx->ccid2hctx_packets_acked = 0; + } else if (++hctx->packets_acked >= hctx->cwnd) { + hctx->cwnd += 1; + hctx->packets_acked = 0; } /* update RTO */ - if (hctx->ccid2hctx_srtt == -1 || - time_after(jiffies, hctx->ccid2hctx_lastrtt + hctx->ccid2hctx_srtt)) { + if (hctx->srtt == -1 || + time_after(jiffies, hctx->lastrtt + hctx->srtt)) { unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; int s; /* first measurement */ - if (hctx->ccid2hctx_srtt == -1) { + if (hctx->srtt == -1) { ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", r, jiffies, (unsigned long long)seqp->ccid2s_seq); ccid2_change_srtt(hctx, r); - hctx->ccid2hctx_rttvar = r >> 1; + hctx->rttvar = r >> 1; } else { /* RTTVAR */ - long tmp = hctx->ccid2hctx_srtt - r; + long tmp = hctx->srtt - r; long srtt; if (tmp < 0) tmp *= -1; tmp >>= 2; - hctx->ccid2hctx_rttvar *= 3; - hctx->ccid2hctx_rttvar >>= 2; - hctx->ccid2hctx_rttvar += tmp; + hctx->rttvar *= 3; + hctx->rttvar >>= 2; + hctx->rttvar += tmp; /* SRTT */ - srtt = hctx->ccid2hctx_srtt; + srtt = hctx->srtt; srtt *= 7; srtt >>= 3; tmp = r >> 3; srtt += tmp; ccid2_change_srtt(hctx, srtt); } - s = hctx->ccid2hctx_rttvar << 2; + s = hctx->rttvar << 2; /* clock granularity is 1 when based on jiffies */ if (!s) s = 1; - hctx->ccid2hctx_rto = hctx->ccid2hctx_srtt + s; + hctx->rto = hctx->srtt + s; /* must be at least a second */ - s = hctx->ccid2hctx_rto / HZ; + s = hctx->rto / HZ; /* DCCP doesn't require this [but I like it cuz my code sux] */ #if 1 if (s < 1) - hctx->ccid2hctx_rto = HZ; + hctx->rto = HZ; #endif /* max 60 seconds */ if (s > 60) - hctx->ccid2hctx_rto = HZ * 60; + hctx->rto = HZ * 60; - hctx->ccid2hctx_lastrtt = jiffies; + hctx->lastrtt = jiffies; ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", - hctx->ccid2hctx_srtt, hctx->ccid2hctx_rttvar, - hctx->ccid2hctx_rto, HZ, r); + hctx->srtt, hctx->rttvar, + hctx->rto, HZ, r); } /* we got a new ack, so re-start RTO timer */ @@ -474,12 +471,12 @@ static void ccid2_hc_tx_dec_pipe(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (hctx->ccid2hctx_pipe == 0) + if (hctx->pipe == 0) DCCP_BUG("pipe == 0"); else - hctx->ccid2hctx_pipe--; + hctx->pipe--; - if (hctx->ccid2hctx_pipe == 0) + if (hctx->pipe == 0) ccid2_hc_tx_kill_rto_timer(sk); } @@ -487,19 +484,19 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - if (time_before(seqp->ccid2s_sent, hctx->ccid2hctx_last_cong)) { + if (time_before(seqp->ccid2s_sent, hctx->last_cong)) { ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); return; } - hctx->ccid2hctx_last_cong = jiffies; + hctx->last_cong = jiffies; - hctx->ccid2hctx_cwnd = hctx->ccid2hctx_cwnd / 2 ? : 1U; - hctx->ccid2hctx_ssthresh = max(hctx->ccid2hctx_cwnd, 2U); + hctx->cwnd = hctx->cwnd / 2 ? : 1U; + hctx->ssthresh = max(hctx->cwnd, 2U); /* Avoid spurious timeouts resulting from Ack Ratio > cwnd */ - if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->ccid2hctx_cwnd) - ccid2_change_l_ack_ratio(sk, hctx->ccid2hctx_cwnd); + if (dccp_sk(sk)->dccps_l_ack_ratio > hctx->cwnd) + ccid2_change_l_ack_ratio(sk, hctx->cwnd); } static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) @@ -523,21 +520,21 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * -sorbo. */ /* need to bootstrap */ - if (hctx->ccid2hctx_rpdupack == -1) { - hctx->ccid2hctx_rpdupack = 0; - hctx->ccid2hctx_rpseq = seqno; + if (hctx->rpdupack == -1) { + hctx->rpdupack = 0; + hctx->rpseq = seqno; } else { /* check if packet is consecutive */ - if (dccp_delta_seqno(hctx->ccid2hctx_rpseq, seqno) == 1) - hctx->ccid2hctx_rpseq = seqno; + if (dccp_delta_seqno(hctx->rpseq, seqno) == 1) + hctx->rpseq = seqno; /* it's a later packet */ - else if (after48(seqno, hctx->ccid2hctx_rpseq)) { - hctx->ccid2hctx_rpdupack++; + else if (after48(seqno, hctx->rpseq)) { + hctx->rpdupack++; /* check if we got enough dupacks */ - if (hctx->ccid2hctx_rpdupack >= NUMDUPACK) { - hctx->ccid2hctx_rpdupack = -1; /* XXX lame */ - hctx->ccid2hctx_rpseq = 0; + if (hctx->rpdupack >= NUMDUPACK) { + hctx->rpdupack = -1; /* XXX lame */ + hctx->rpseq = 0; ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio); } @@ -546,7 +543,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* check forward path congestion */ /* still didn't send out new data packets */ - if (hctx->ccid2hctx_seqh == hctx->ccid2hctx_seqt) + if (hctx->seqh == hctx->seqt) return; switch (DCCP_SKB_CB(skb)->dccpd_type) { @@ -558,14 +555,14 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; - if (after48(ackno, hctx->ccid2hctx_high_ack)) - hctx->ccid2hctx_high_ack = ackno; + if (after48(ackno, hctx->high_ack)) + hctx->high_ack = ackno; - seqp = hctx->ccid2hctx_seqt; + seqp = hctx->seqt; while (before48(seqp->ccid2s_seq, ackno)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->ccid2hctx_seqh) { - seqp = hctx->ccid2hctx_seqh->ccid2s_prev; + if (seqp == hctx->seqh) { + seqp = hctx->seqh->ccid2s_prev; break; } } @@ -575,7 +572,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * packets per acknowledgement. Rounding up avoids that cwnd is not * advanced when Ack Ratio is 1 and gives a slight edge otherwise. */ - if (hctx->ccid2hctx_cwnd < hctx->ccid2hctx_ssthresh) + if (hctx->cwnd < hctx->ssthresh) maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); /* go through all ack vectors */ @@ -594,7 +591,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * seqnos. */ while (after48(seqp->ccid2s_seq, ackno)) { - if (seqp == hctx->ccid2hctx_seqt) { + if (seqp == hctx->seqt) { done = 1; break; } @@ -626,7 +623,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) (unsigned long long)seqp->ccid2s_seq); ccid2_hc_tx_dec_pipe(sk); } - if (seqp == hctx->ccid2hctx_seqt) { + if (seqp == hctx->seqt) { done = 1; break; } @@ -645,11 +642,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* The state about what is acked should be correct now * Check for NUMDUPACK */ - seqp = hctx->ccid2hctx_seqt; - while (before48(seqp->ccid2s_seq, hctx->ccid2hctx_high_ack)) { + seqp = hctx->seqt; + while (before48(seqp->ccid2s_seq, hctx->high_ack)) { seqp = seqp->ccid2s_next; - if (seqp == hctx->ccid2hctx_seqh) { - seqp = hctx->ccid2hctx_seqh->ccid2s_prev; + if (seqp == hctx->seqh) { + seqp = hctx->seqh->ccid2s_prev; break; } } @@ -660,7 +657,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (done == NUMDUPACK) break; } - if (seqp == hctx->ccid2hctx_seqt) + if (seqp == hctx->seqt) break; seqp = seqp->ccid2s_prev; } @@ -683,20 +680,20 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) ccid2_congestion_event(sk, seqp); ccid2_hc_tx_dec_pipe(sk); } - if (seqp == hctx->ccid2hctx_seqt) + if (seqp == hctx->seqt) break; seqp = seqp->ccid2s_prev; } - hctx->ccid2hctx_seqt = last_acked; + hctx->seqt = last_acked; } /* trim acked packets in tail */ - while (hctx->ccid2hctx_seqt != hctx->ccid2hctx_seqh) { - if (!hctx->ccid2hctx_seqt->ccid2s_acked) + while (hctx->seqt != hctx->seqh) { + if (!hctx->seqt->ccid2s_acked) break; - hctx->ccid2hctx_seqt = hctx->ccid2hctx_seqt->ccid2s_next; + hctx->seqt = hctx->seqt->ccid2s_next; } ccid2_hc_tx_check_sanity(hctx); @@ -709,17 +706,17 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) u32 max_ratio; /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ - hctx->ccid2hctx_ssthresh = ~0U; + hctx->ssthresh = ~0U; /* * RFC 4341, 5: "The cwnd parameter is initialized to at most four * packets for new connections, following the rules from [RFC3390]". * We need to convert the bytes of RFC3390 into the packets of RFC 4341. */ - hctx->ccid2hctx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); + hctx->cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); /* Make sure that Ack Ratio is enabled and within bounds. */ - max_ratio = DIV_ROUND_UP(hctx->ccid2hctx_cwnd, 2); + max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio) dp->dccps_l_ack_ratio = max_ratio; @@ -727,13 +724,12 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) if (ccid2_hc_tx_alloc_seq(hctx)) return -ENOMEM; - hctx->ccid2hctx_rto = 3 * HZ; + hctx->rto = 3 * HZ; ccid2_change_srtt(hctx, -1); - hctx->ccid2hctx_rttvar = -1; - hctx->ccid2hctx_rpdupack = -1; - hctx->ccid2hctx_last_cong = jiffies; - setup_timer(&hctx->ccid2hctx_rtotimer, ccid2_hc_tx_rto_expire, - (unsigned long)sk); + hctx->rttvar = -1; + hctx->rpdupack = -1; + hctx->last_cong = jiffies; + setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); ccid2_hc_tx_check_sanity(hctx); return 0; @@ -746,9 +742,9 @@ static void ccid2_hc_tx_exit(struct sock *sk) ccid2_hc_tx_kill_rto_timer(sk); - for (i = 0; i < hctx->ccid2hctx_seqbufc; i++) - kfree(hctx->ccid2hctx_seqbuf[i]); - hctx->ccid2hctx_seqbufc = 0; + for (i = 0; i < hctx->seqbufc; i++) + kfree(hctx->seqbuf[i]); + hctx->seqbufc = 0; } static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) @@ -759,10 +755,10 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) switch (DCCP_SKB_CB(skb)->dccpd_type) { case DCCP_PKT_DATA: case DCCP_PKT_DATAACK: - hcrx->ccid2hcrx_data++; - if (hcrx->ccid2hcrx_data >= dp->dccps_r_ack_ratio) { + hcrx->data++; + if (hcrx->data >= dp->dccps_r_ack_ratio) { dccp_send_ack(sk); - hcrx->ccid2hcrx_data = 0; + hcrx->data = 0; } break; } diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 2c94ca029010..d7815804bceb 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -42,34 +42,34 @@ struct ccid2_seq { /** struct ccid2_hc_tx_sock - CCID2 TX half connection * - * @ccid2hctx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 - * @ccid2hctx_packets_acked - Ack counter for deriving cwnd growth (RFC 3465) - * @ccid2hctx_lastrtt -time RTT was last measured - * @ccid2hctx_rpseq - last consecutive seqno - * @ccid2hctx_rpdupack - dupacks since rpseq -*/ + * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 + * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) + * @lastrtt: time RTT was last measured + * @rpseq: last consecutive seqno + * @rpdupack: dupacks since rpseq + */ struct ccid2_hc_tx_sock { - u32 ccid2hctx_cwnd; - u32 ccid2hctx_ssthresh; - u32 ccid2hctx_pipe; - u32 ccid2hctx_packets_acked; - struct ccid2_seq *ccid2hctx_seqbuf[CCID2_SEQBUF_MAX]; - int ccid2hctx_seqbufc; - struct ccid2_seq *ccid2hctx_seqh; - struct ccid2_seq *ccid2hctx_seqt; - long ccid2hctx_rto; - long ccid2hctx_srtt; - long ccid2hctx_rttvar; - unsigned long ccid2hctx_lastrtt; - struct timer_list ccid2hctx_rtotimer; - u64 ccid2hctx_rpseq; - int ccid2hctx_rpdupack; - unsigned long ccid2hctx_last_cong; - u64 ccid2hctx_high_ack; + u32 cwnd; + u32 ssthresh; + u32 pipe; + u32 packets_acked; + struct ccid2_seq *seqbuf[CCID2_SEQBUF_MAX]; + int seqbufc; + struct ccid2_seq *seqh; + struct ccid2_seq *seqt; + long rto; + long srtt; + long rttvar; + unsigned long lastrtt; + struct timer_list rtotimer; + u64 rpseq; + int rpdupack; + unsigned long last_cong; + u64 high_ack; }; struct ccid2_hc_rx_sock { - int ccid2hcrx_data; + int data; }; static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) From 842d1ef14ff37e9611eab479f31a0d74c1a5c4c0 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 050/105] dccp ccid-3: Remove ccid3hc{tx,rx}_ prefixes This patch does the same for CCID-3 as the previous patch for CCID-2: s#ccid3hctx_##g; s#ccid3hcrx_##g; plus manual editing to retain consistency. Please note: expanded the fields of the `struct tfrc_tx_info' in the hc_tx_sock, since using short #define identifiers is not a good idea. The only place where this embedded struct was used is ccid3_hc_tx_getsockopt(). Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 337 +++++++++++++++++++---------------------- net/dccp/ccids/ccid3.h | 119 ++++++++------- net/dccp/probe.c | 6 +- 3 files changed, 220 insertions(+), 242 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 3b8bd7ca6761..2f026ce27148 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -67,13 +67,13 @@ static void ccid3_hc_tx_set_state(struct sock *sk, enum ccid3_hc_tx_states state) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state; + enum ccid3_hc_tx_states oldstate = hctx->state; ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", dccp_role(sk), sk, ccid3_tx_state_name(oldstate), ccid3_tx_state_name(state)); WARN_ON(state == oldstate); - hctx->ccid3hctx_state = state; + hctx->state = state; } /* @@ -88,10 +88,9 @@ static void ccid3_hc_tx_set_state(struct sock *sk, static inline u64 rfc3390_initial_rate(struct sock *sk) { const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - const __u32 w_init = clamp_t(__u32, 4380U, - 2 * hctx->ccid3hctx_s, 4 * hctx->ccid3hctx_s); + const __u32 w_init = clamp_t(__u32, 4380U, 2 * hctx->s, 4 * hctx->s); - return scaled_div(w_init << 6, hctx->ccid3hctx_rtt); + return scaled_div(w_init << 6, hctx->rtt); } /* @@ -100,24 +99,20 @@ static inline u64 rfc3390_initial_rate(struct sock *sk) static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) { /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ - hctx->ccid3hctx_t_ipi = scaled_div32(((u64)hctx->ccid3hctx_s) << 6, - hctx->ccid3hctx_x); + hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ - hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2, - TFRC_OPSYS_HALF_TIME_GRAN); - - ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", - hctx->ccid3hctx_t_ipi, hctx->ccid3hctx_delta, - hctx->ccid3hctx_s, (unsigned)(hctx->ccid3hctx_x >> 6)); + hctx->delta = min_t(u32, hctx->t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); + ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hctx->t_ipi, + hctx->delta, hctx->s, (unsigned)(hctx->x >> 6)); } static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) { - u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); + u32 delta = ktime_us_delta(now, hctx->t_last_win_count); - return delta / hctx->ccid3hctx_rtt; + return delta / hctx->rtt; } /** @@ -133,8 +128,8 @@ static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - __u64 min_rate = 2 * hctx->ccid3hctx_x_recv; - const __u64 old_x = hctx->ccid3hctx_x; + u64 min_rate = 2 * hctx->x_recv; + const u64 old_x = hctx->x; ktime_t now = stamp ? *stamp : ktime_get_real(); /* @@ -145,33 +140,27 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) */ if (ccid3_hc_tx_idle_rtt(hctx, now) >= 2) { min_rate = rfc3390_initial_rate(sk); - min_rate = max(min_rate, 2 * hctx->ccid3hctx_x_recv); + min_rate = max(min_rate, 2 * hctx->x_recv); } - if (hctx->ccid3hctx_p > 0) { + if (hctx->p > 0) { - hctx->ccid3hctx_x = min(((__u64)hctx->ccid3hctx_x_calc) << 6, - min_rate); - hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, - (((__u64)hctx->ccid3hctx_s) << 6) / - TFRC_T_MBI); + hctx->x = min(((u64)hctx->x_calc) << 6, min_rate); + hctx->x = max(hctx->x, (((u64)hctx->s) << 6) / TFRC_T_MBI); - } else if (ktime_us_delta(now, hctx->ccid3hctx_t_ld) - - (s64)hctx->ccid3hctx_rtt >= 0) { + } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) { - hctx->ccid3hctx_x = min(2 * hctx->ccid3hctx_x, min_rate); - hctx->ccid3hctx_x = max(hctx->ccid3hctx_x, - scaled_div(((__u64)hctx->ccid3hctx_s) << 6, - hctx->ccid3hctx_rtt)); - hctx->ccid3hctx_t_ld = now; + hctx->x = min(2 * hctx->x, min_rate); + hctx->x = max(hctx->x, + scaled_div(((u64)hctx->s) << 6, hctx->rtt)); + hctx->t_ld = now; } - if (hctx->ccid3hctx_x != old_x) { + if (hctx->x != old_x) { ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, " "X_recv=%u\n", (unsigned)(old_x >> 6), - (unsigned)(hctx->ccid3hctx_x >> 6), - hctx->ccid3hctx_x_calc, - (unsigned)(hctx->ccid3hctx_x_recv >> 6)); + (unsigned)(hctx->x >> 6), hctx->x_calc, + (unsigned)(hctx->x_recv >> 6)); ccid3_update_send_interval(hctx); } @@ -183,11 +172,11 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) */ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) { - const u16 old_s = hctx->ccid3hctx_s; + const u16 old_s = hctx->s; - hctx->ccid3hctx_s = tfrc_ewma(hctx->ccid3hctx_s, len, 9); + hctx->s = tfrc_ewma(hctx->s, len, 9); - if (hctx->ccid3hctx_s != old_s) + if (hctx->s != old_s) ccid3_update_send_interval(hctx); } @@ -198,13 +187,13 @@ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, ktime_t now) { - u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count), - quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt; + u32 delta = ktime_us_delta(now, hctx->t_last_win_count), + quarter_rtts = (4 * delta) / hctx->rtt; if (quarter_rtts > 0) { - hctx->ccid3hctx_t_last_win_count = now; - hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U); - hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ + hctx->t_last_win_count = now; + hctx->last_win_count += min(quarter_rtts, 5U); + hctx->last_win_count &= 0xF; /* mod 16 */ } } @@ -222,23 +211,21 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) } ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, - ccid3_tx_state_name(hctx->ccid3hctx_state)); + ccid3_tx_state_name(hctx->state)); - if (hctx->ccid3hctx_state == TFRC_SSTATE_FBACK) + if (hctx->state == TFRC_SSTATE_FBACK) ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - else if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + else if (hctx->state != TFRC_SSTATE_NO_FBACK) goto out; /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 + * RTO is 0 if and only if no feedback has been received yet. */ - if (hctx->ccid3hctx_t_rto == 0 || /* no feedback received yet */ - hctx->ccid3hctx_p == 0) { + if (hctx->t_rto == 0 || hctx->p == 0) { /* halve send rate directly */ - hctx->ccid3hctx_x = max(hctx->ccid3hctx_x / 2, - (((__u64)hctx->ccid3hctx_s) << 6) / - TFRC_T_MBI); + hctx->x = max(hctx->x / 2, (((u64)hctx->s) << 6) / TFRC_T_MBI); ccid3_update_send_interval(hctx); } else { /* @@ -251,33 +238,32 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) * * Note that X_recv is scaled by 2^6 while X_calc is not */ - BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); + BUG_ON(hctx->p && !hctx->x_calc); - if (hctx->ccid3hctx_x_calc > (hctx->ccid3hctx_x_recv >> 5)) - hctx->ccid3hctx_x_recv = - max(hctx->ccid3hctx_x_recv / 2, - (((__u64)hctx->ccid3hctx_s) << 6) / - (2 * TFRC_T_MBI)); + if (hctx->x_calc > (hctx->x_recv >> 5)) + hctx->x_recv = + max(hctx->x_recv / 2, + (((__u64)hctx->s) << 6) / (2 * TFRC_T_MBI)); else { - hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc; - hctx->ccid3hctx_x_recv <<= 4; + hctx->x_recv = hctx->x_calc; + hctx->x_recv <<= 4; } ccid3_hc_tx_update_x(sk, NULL); } ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n", - (unsigned long long)hctx->ccid3hctx_x); + (unsigned long long)hctx->x); /* * Set new timeout for the nofeedback timer. * See comments in packet_recv() regarding the value of t_RTO. */ - if (unlikely(hctx->ccid3hctx_t_rto == 0)) /* no feedback yet */ + if (unlikely(hctx->t_rto == 0)) /* no feedback received yet */ t_nfb = TFRC_INITIAL_TIMEOUT; else - t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); + t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); restart_timer: - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + sk_reset_timer(sk, &hctx->no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); out: bh_unlock_sock(sk); @@ -305,18 +291,17 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) if (unlikely(skb->len == 0)) return -EBADMSG; - switch (hctx->ccid3hctx_state) { + switch (hctx->state) { case TFRC_SSTATE_NO_SENT: - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, - (jiffies + + sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); - hctx->ccid3hctx_last_win_count = 0; - hctx->ccid3hctx_t_last_win_count = now; + hctx->last_win_count = 0; + hctx->t_last_win_count = now; /* Set t_0 for initial packet */ - hctx->ccid3hctx_t_nom = now; + hctx->t_nom = now; - hctx->ccid3hctx_s = skb->len; + hctx->s = skb->len; /* * Use initial RTT sample when available: recommended by erratum @@ -325,9 +310,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) */ if (dp->dccps_syn_rtt) { ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt); - hctx->ccid3hctx_rtt = dp->dccps_syn_rtt; - hctx->ccid3hctx_x = rfc3390_initial_rate(sk); - hctx->ccid3hctx_t_ld = now; + hctx->rtt = dp->dccps_syn_rtt; + hctx->x = rfc3390_initial_rate(sk); + hctx->t_ld = now; } else { /* * Sender does not have RTT sample: @@ -335,9 +320,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * is needed in several parts (e.g. window counter); * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. */ - hctx->ccid3hctx_rtt = DCCP_FALLBACK_RTT; - hctx->ccid3hctx_x = hctx->ccid3hctx_s; - hctx->ccid3hctx_x <<= 6; + hctx->rtt = DCCP_FALLBACK_RTT; + hctx->x = hctx->s; + hctx->x <<= 6; } ccid3_update_send_interval(hctx); @@ -345,7 +330,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) break; case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: - delay = ktime_us_delta(hctx->ccid3hctx_t_nom, now); + delay = ktime_us_delta(hctx->t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); /* * Scheduling of packet transmissions [RFC 3448, 4.6] @@ -355,7 +340,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * else * // send the packet in (t_nom - t_now) milliseconds. */ - if (delay - (s64)hctx->ccid3hctx_delta >= 1000) + if (delay - (s64)hctx->delta >= 1000) return (u32)delay / 1000L; ccid3_hc_tx_update_win_count(hctx, now); @@ -367,11 +352,10 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) /* prepare to send now (add options etc.) */ dp->dccps_hc_tx_insert_options = 1; - DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; + DCCP_SKB_CB(skb)->dccpd_ccval = hctx->last_win_count; /* set the nominal send time for the next following packet */ - hctx->ccid3hctx_t_nom = ktime_add_us(hctx->ccid3hctx_t_nom, - hctx->ccid3hctx_t_ipi); + hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); return 0; } @@ -382,14 +366,14 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, ccid3_hc_tx_update_s(hctx, len); - if (tfrc_tx_hist_add(&hctx->ccid3hctx_hist, dccp_sk(sk)->dccps_gss)) + if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) DCCP_CRIT("packet history - out of memory!"); } static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv; + struct ccid3_options_received *opt_recv = &hctx->options_received; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -399,15 +383,14 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) return; /* ... and only in the established state */ - if (hctx->ccid3hctx_state != TFRC_SSTATE_FBACK && - hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK) + if (hctx->state != TFRC_SSTATE_FBACK && + hctx->state != TFRC_SSTATE_NO_FBACK) return; - opt_recv = &hctx->ccid3hctx_options_received; now = ktime_get_real(); /* Estimate RTT from history if ACK number is valid */ - r_sample = tfrc_tx_hist_rtt(hctx->ccid3hctx_hist, + r_sample = tfrc_tx_hist_rtt(hctx->hist, DCCP_SKB_CB(skb)->dccpd_ack_seq, now); if (r_sample == 0) { DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, @@ -417,37 +400,37 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* Update receive rate in units of 64 * bytes/second */ - hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; - hctx->ccid3hctx_x_recv <<= 6; + hctx->x_recv = opt_recv->ccid3or_receive_rate; + hctx->x_recv <<= 6; /* Update loss event rate (which is scaled by 1e6) */ pinv = opt_recv->ccid3or_loss_event_rate; if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ - hctx->ccid3hctx_p = 0; + hctx->p = 0; else /* can not exceed 100% */ - hctx->ccid3hctx_p = scaled_div(1, pinv); + hctx->p = scaled_div(1, pinv); /* * Validate new RTT sample and update moving average */ r_sample = dccp_sample_rtt(sk, r_sample); - hctx->ccid3hctx_rtt = tfrc_ewma(hctx->ccid3hctx_rtt, r_sample, 9); + hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ - if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) { + if (hctx->state == TFRC_SSTATE_NO_FBACK) { ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); - if (hctx->ccid3hctx_t_rto == 0) { + if (hctx->t_rto == 0) { /* * Initial feedback packet: Larger Initial Windows (4.2) */ - hctx->ccid3hctx_x = rfc3390_initial_rate(sk); - hctx->ccid3hctx_t_ld = now; + hctx->x = rfc3390_initial_rate(sk); + hctx->t_ld = now; ccid3_update_send_interval(hctx); goto done_computing_x; - } else if (hctx->ccid3hctx_p == 0) { + } else if (hctx->p == 0) { /* * First feedback after nofeedback timer expiry (4.3) */ @@ -456,25 +439,20 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* Update sending rate (step 4 of [RFC 3448, 4.3]) */ - if (hctx->ccid3hctx_p > 0) - hctx->ccid3hctx_x_calc = - tfrc_calc_x(hctx->ccid3hctx_s, - hctx->ccid3hctx_rtt, - hctx->ccid3hctx_p); + if (hctx->p > 0) + hctx->x_calc = tfrc_calc_x(hctx->s, hctx->rtt, hctx->p); ccid3_hc_tx_update_x(sk, &now); done_computing_x: ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, " "p=%u, X_calc=%u, X_recv=%u, X=%u\n", - dccp_role(sk), - sk, hctx->ccid3hctx_rtt, r_sample, - hctx->ccid3hctx_s, hctx->ccid3hctx_p, - hctx->ccid3hctx_x_calc, - (unsigned)(hctx->ccid3hctx_x_recv >> 6), - (unsigned)(hctx->ccid3hctx_x >> 6)); + dccp_role(sk), sk, hctx->rtt, r_sample, + hctx->s, hctx->p, hctx->x_calc, + (unsigned)(hctx->x_recv >> 6), + (unsigned)(hctx->x >> 6)); /* unschedule no feedback timer */ - sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + sk_stop_timer(sk, &hctx->no_feedback_timer); /* * As we have calculated new ipi, delta, t_nom it is possible @@ -488,21 +466,19 @@ done_computing_x: * This can help avoid triggering the nofeedback timer too * often ('spinning') on LANs with small RTTs. */ - hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, - (CONFIG_IP_DCCP_CCID3_RTO * - (USEC_PER_SEC / 1000))); + hctx->t_rto = max_t(u32, 4 * hctx->rtt, (CONFIG_IP_DCCP_CCID3_RTO * + (USEC_PER_SEC / 1000))); /* * Schedule no feedback timer to expire in * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) */ - t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); + t_nfb = max(hctx->t_rto, 2 * hctx->t_ipi); ccid3_pr_debug("%s(%p), Scheduled no feedback timer to " "expire in %lu jiffies (%luus)\n", - dccp_role(sk), - sk, usecs_to_jiffies(t_nfb), t_nfb); + dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb); - sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer, + sk_reset_timer(sk, &hctx->no_feedback_timer, jiffies + usecs_to_jiffies(t_nfb)); } @@ -513,11 +489,9 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, int rc = 0; const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv; + struct ccid3_options_received *opt_recv = &hctx->options_received; __be32 opt_val; - opt_recv = &hctx->ccid3hctx_options_received; - if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { opt_recv->ccid3or_seqno = dp->dccps_gsr; opt_recv->ccid3or_loss_event_rate = ~0; @@ -572,11 +546,10 @@ static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); - hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT; - hctx->ccid3hctx_hist = NULL; - setup_timer(&hctx->ccid3hctx_no_feedback_timer, - ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); - + hctx->state = TFRC_SSTATE_NO_SENT; + hctx->hist = NULL; + setup_timer(&hctx->no_feedback_timer, + ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); return 0; } @@ -585,9 +558,9 @@ static void ccid3_hc_tx_exit(struct sock *sk) struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); - sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer); + sk_stop_timer(sk, &hctx->no_feedback_timer); - tfrc_tx_hist_purge(&hctx->ccid3hctx_hist); + tfrc_tx_hist_purge(&hctx->hist); } static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) @@ -599,14 +572,15 @@ static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) return; hctx = ccid3_hc_tx_sk(sk); - info->tcpi_rto = hctx->ccid3hctx_t_rto; - info->tcpi_rtt = hctx->ccid3hctx_rtt; + info->tcpi_rto = hctx->t_rto; + info->tcpi_rtt = hctx->rtt; } static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { const struct ccid3_hc_tx_sock *hctx; + struct tfrc_tx_info tfrc; const void *val; /* Listen socks doesn't have a private CCID block */ @@ -616,10 +590,17 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, hctx = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(hctx->ccid3hctx_tfrc)) + if (len < sizeof(tfrc)) return -EINVAL; - len = sizeof(hctx->ccid3hctx_tfrc); - val = &hctx->ccid3hctx_tfrc; + tfrc.tfrctx_x = hctx->x; + tfrc.tfrctx_x_recv = hctx->x_recv; + tfrc.tfrctx_x_calc = hctx->x_calc; + tfrc.tfrctx_rtt = hctx->rtt; + tfrc.tfrctx_p = hctx->p; + tfrc.tfrctx_rto = hctx->t_rto; + tfrc.tfrctx_ipi = hctx->t_ipi; + len = sizeof(tfrc); + val = &tfrc; break; default: return -ENOPROTOOPT; @@ -660,13 +641,13 @@ static void ccid3_hc_rx_set_state(struct sock *sk, enum ccid3_hc_rx_states state) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state; + enum ccid3_hc_rx_states oldstate = hcrx->state; ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", dccp_role(sk), sk, ccid3_rx_state_name(oldstate), ccid3_rx_state_name(state)); WARN_ON(state == oldstate); - hcrx->ccid3hcrx_state = state; + hcrx->state = state; } static void ccid3_hc_rx_send_feedback(struct sock *sk, @@ -678,15 +659,15 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, ktime_t now; s64 delta = 0; - if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_TERM)) + if (unlikely(hcrx->state == TFRC_RSTATE_TERM)) return; now = ktime_get_real(); switch (fbtype) { case CCID3_FBACK_INITIAL: - hcrx->ccid3hcrx_x_recv = 0; - hcrx->ccid3hcrx_pinv = ~0U; /* see RFC 4342, 8.5 */ + hcrx->x_recv = 0; + hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ break; case CCID3_FBACK_PARAM_CHANGE: /* @@ -699,27 +680,26 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, * the number of bytes since last feedback. * This is a safe fallback, since X is bounded above by X_calc. */ - if (hcrx->ccid3hcrx_x_recv > 0) + if (hcrx->x_recv > 0) break; /* fall through */ case CCID3_FBACK_PERIODIC: - delta = ktime_us_delta(now, hcrx->ccid3hcrx_tstamp_last_feedback); + delta = ktime_us_delta(now, hcrx->tstamp_last_feedback); if (delta <= 0) DCCP_BUG("delta (%ld) <= 0", (long)delta); else - hcrx->ccid3hcrx_x_recv = - scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); + hcrx->x_recv = scaled_div32(hcrx->bytes_recv, delta); break; default: return; } - ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta, - hcrx->ccid3hcrx_x_recv, hcrx->ccid3hcrx_pinv); + ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", + (long)delta, hcrx->x_recv, hcrx->p_inverse); - hcrx->ccid3hcrx_tstamp_last_feedback = now; - hcrx->ccid3hcrx_last_counter = dccp_hdr(skb)->dccph_ccval; - hcrx->ccid3hcrx_bytes_recv = 0; + hcrx->tstamp_last_feedback = now; + hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; + hcrx->bytes_recv = 0; dp->dccps_hc_rx_insert_options = 1; dccp_send_ack(sk); @@ -738,8 +718,8 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) if (dccp_packet_without_ack(skb)) return 0; - x_recv = htonl(hcrx->ccid3hcrx_x_recv); - pinv = htonl(hcrx->ccid3hcrx_pinv); + x_recv = htonl(hcrx->x_recv); + pinv = htonl(hcrx->p_inverse); if (dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE, &pinv, sizeof(pinv)) || @@ -765,22 +745,23 @@ static u32 ccid3_first_li(struct sock *sk) u32 x_recv, p, delta; u64 fval; - if (hcrx->ccid3hcrx_rtt == 0) { + if (hcrx->rtt == 0) { DCCP_WARN("No RTT estimate available, using fallback RTT\n"); - hcrx->ccid3hcrx_rtt = DCCP_FALLBACK_RTT; + hcrx->rtt = DCCP_FALLBACK_RTT; } - delta = ktime_to_us(net_timedelta(hcrx->ccid3hcrx_tstamp_last_feedback)); - x_recv = scaled_div32(hcrx->ccid3hcrx_bytes_recv, delta); + delta = ktime_to_us(net_timedelta(hcrx->tstamp_last_feedback)); + x_recv = scaled_div32(hcrx->bytes_recv, delta); if (x_recv == 0) { /* would also trigger divide-by-zero */ DCCP_WARN("X_recv==0\n"); - if ((x_recv = hcrx->ccid3hcrx_x_recv) == 0) { + if (hcrx->x_recv == 0) { DCCP_BUG("stored value of X_recv is zero"); return ~0U; } + x_recv = hcrx->x_recv; } - fval = scaled_div(hcrx->ccid3hcrx_s, hcrx->ccid3hcrx_rtt); + fval = scaled_div(hcrx->s, hcrx->rtt); fval = scaled_div32(fval, x_recv); p = tfrc_calc_x_reverse_lookup(fval); @@ -797,14 +778,14 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; const bool is_data_packet = dccp_data_packet(skb); - if (unlikely(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)) { + if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) { if (is_data_packet) { const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; do_feedback = CCID3_FBACK_INITIAL; ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); - hcrx->ccid3hcrx_s = payload; + hcrx->s = payload; /* - * Not necessary to update ccid3hcrx_bytes_recv here, + * Not necessary to update bytes_recv here, * since X_recv = 0 for the first feedback packet (cf. * RFC 3448, 6.3) -- gerrit */ @@ -812,7 +793,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) goto update_records; } - if (tfrc_rx_hist_duplicate(&hcrx->ccid3hcrx_hist, skb)) + if (tfrc_rx_hist_duplicate(&hcrx->hist, skb)) return; /* done receiving */ if (is_data_packet) { @@ -820,20 +801,20 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) /* * Update moving-average of s and the sum of received payload bytes */ - hcrx->ccid3hcrx_s = tfrc_ewma(hcrx->ccid3hcrx_s, payload, 9); - hcrx->ccid3hcrx_bytes_recv += payload; + hcrx->s = tfrc_ewma(hcrx->s, payload, 9); + hcrx->bytes_recv += payload; } /* * Perform loss detection and handle pending losses */ - if (tfrc_rx_handle_loss(&hcrx->ccid3hcrx_hist, &hcrx->ccid3hcrx_li_hist, + if (tfrc_rx_handle_loss(&hcrx->hist, &hcrx->li_hist, skb, ndp, ccid3_first_li, sk)) { do_feedback = CCID3_FBACK_PARAM_CHANGE; goto done_receiving; } - if (tfrc_rx_hist_loss_pending(&hcrx->ccid3hcrx_hist)) + if (tfrc_rx_hist_loss_pending(&hcrx->hist)) return; /* done receiving */ /* @@ -842,17 +823,17 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) if (unlikely(!is_data_packet)) goto update_records; - if (!tfrc_lh_is_initialised(&hcrx->ccid3hcrx_li_hist)) { - const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->ccid3hcrx_hist, skb); + if (!tfrc_lh_is_initialised(&hcrx->li_hist)) { + const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->hist, skb); /* * Empty loss history: no loss so far, hence p stays 0. * Sample RTT values, since an RTT estimate is required for the * computation of p when the first loss occurs; RFC 3448, 6.3.1. */ if (sample != 0) - hcrx->ccid3hcrx_rtt = tfrc_ewma(hcrx->ccid3hcrx_rtt, sample, 9); + hcrx->rtt = tfrc_ewma(hcrx->rtt, sample, 9); - } else if (tfrc_lh_update_i_mean(&hcrx->ccid3hcrx_li_hist, skb)) { + } else if (tfrc_lh_update_i_mean(&hcrx->li_hist, skb)) { /* * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean * has decreased (resp. p has increased), send feedback now. @@ -863,11 +844,11 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) /* * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 */ - if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->ccid3hcrx_last_counter) > 3) + if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) do_feedback = CCID3_FBACK_PERIODIC; update_records: - tfrc_rx_hist_add_packet(&hcrx->ccid3hcrx_hist, skb, ndp); + tfrc_rx_hist_add_packet(&hcrx->hist, skb, ndp); done_receiving: if (do_feedback) @@ -878,9 +859,9 @@ static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); - hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA; - tfrc_lh_init(&hcrx->ccid3hcrx_li_hist); - return tfrc_rx_hist_alloc(&hcrx->ccid3hcrx_hist); + hcrx->state = TFRC_RSTATE_NO_DATA; + tfrc_lh_init(&hcrx->li_hist); + return tfrc_rx_hist_alloc(&hcrx->hist); } static void ccid3_hc_rx_exit(struct sock *sk) @@ -889,8 +870,8 @@ static void ccid3_hc_rx_exit(struct sock *sk) ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); - tfrc_rx_hist_purge(&hcrx->ccid3hcrx_hist); - tfrc_lh_cleanup(&hcrx->ccid3hcrx_li_hist); + tfrc_rx_hist_purge(&hcrx->hist); + tfrc_lh_cleanup(&hcrx->li_hist); } static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) @@ -902,9 +883,9 @@ static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) return; hcrx = ccid3_hc_rx_sk(sk); - info->tcpi_ca_state = hcrx->ccid3hcrx_state; + info->tcpi_ca_state = hcrx->state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt; + info->tcpi_rcv_rtt = hcrx->rtt; } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, @@ -923,10 +904,10 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, case DCCP_SOCKOPT_CCID_RX_INFO: if (len < sizeof(rx_info)) return -EINVAL; - rx_info.tfrcrx_x_recv = hcrx->ccid3hcrx_x_recv; - rx_info.tfrcrx_rtt = hcrx->ccid3hcrx_rtt; - rx_info.tfrcrx_p = hcrx->ccid3hcrx_pinv == 0 ? ~0U : - scaled_div(1, hcrx->ccid3hcrx_pinv); + rx_info.tfrcrx_x_recv = hcrx->x_recv; + rx_info.tfrcrx_rtt = hcrx->rtt; + rx_info.tfrcrx_p = hcrx->p_inverse == 0 ? ~0U : + scaled_div(1, hcrx->p_inverse); len = sizeof(rx_info); val = &rx_info; break; diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 49ca32bd7e79..0cfcfff8f5fb 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -77,44 +77,43 @@ enum ccid3_hc_tx_states { /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket * - * @ccid3hctx_x - Current sending rate in 64 * bytes per second - * @ccid3hctx_x_recv - Receive rate in 64 * bytes per second - * @ccid3hctx_x_calc - Calculated rate in bytes per second - * @ccid3hctx_rtt - Estimate of current round trip time in usecs - * @ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000 - * @ccid3hctx_s - Packet size in bytes - * @ccid3hctx_t_rto - Nofeedback Timer setting in usecs - * @ccid3hctx_t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs - * @ccid3hctx_state - Sender state, one of %ccid3_hc_tx_states - * @ccid3hctx_last_win_count - Last window counter sent - * @ccid3hctx_t_last_win_count - Timestamp of earliest packet - * with last_win_count value sent - * @ccid3hctx_no_feedback_timer - Handle to no feedback timer - * @ccid3hctx_t_ld - Time last doubled during slow start - * @ccid3hctx_t_nom - Nominal send time of next packet - * @ccid3hctx_delta - Send timer delta (RFC 3448, 4.6) in usecs - * @ccid3hctx_hist - Packet history - * @ccid3hctx_options_received - Parsed set of retrieved options + * @x - Current sending rate in 64 * bytes per second + * @x_recv - Receive rate in 64 * bytes per second + * @x_calc - Calculated rate in bytes per second + * @rtt - Estimate of current round trip time in usecs + * @p - Current loss event rate (0-1) scaled by 1000000 + * @s - Packet size in bytes + * @t_rto - Nofeedback Timer setting in usecs + * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs + * @state - Sender state, one of %ccid3_hc_tx_states + * @last_win_count - Last window counter sent + * @t_last_win_count - Timestamp of earliest packet with + * last_win_count value sent + * @no_feedback_timer - Handle to no feedback timer + * @t_ld - Time last doubled during slow start + * @t_nom - Nominal send time of next packet + * @delta - Send timer delta (RFC 3448, 4.6) in usecs + * @hist - Packet history + * @options_received - Parsed set of retrieved options */ struct ccid3_hc_tx_sock { - struct tfrc_tx_info ccid3hctx_tfrc; -#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x -#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv -#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc -#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt -#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p -#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto -#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi - u16 ccid3hctx_s; - enum ccid3_hc_tx_states ccid3hctx_state:8; - u8 ccid3hctx_last_win_count; - ktime_t ccid3hctx_t_last_win_count; - struct timer_list ccid3hctx_no_feedback_timer; - ktime_t ccid3hctx_t_ld; - ktime_t ccid3hctx_t_nom; - u32 ccid3hctx_delta; - struct tfrc_tx_hist_entry *ccid3hctx_hist; - struct ccid3_options_received ccid3hctx_options_received; + u64 x; + u64 x_recv; + u32 x_calc; + u32 rtt; + u32 p; + u32 t_rto; + u32 t_ipi; + u16 s; + enum ccid3_hc_tx_states state:8; + u8 last_win_count; + ktime_t t_last_win_count; + struct timer_list no_feedback_timer; + ktime_t t_ld; + ktime_t t_nom; + u32 delta; + struct tfrc_tx_hist_entry *hist; + struct ccid3_options_received options_received; }; static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) @@ -133,32 +132,32 @@ enum ccid3_hc_rx_states { /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket * - * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448 4.3) - * @ccid3hcrx_rtt - Receiver estimate of rtt (non-standard) - * @ccid3hcrx_p - Current loss event rate (RFC 3448 5.4) - * @ccid3hcrx_last_counter - Tracks window counter (RFC 4342, 8.1) - * @ccid3hcrx_state - Receiver state, one of %ccid3_hc_rx_states - * @ccid3hcrx_bytes_recv - Total sum of DCCP payload bytes - * @ccid3hcrx_x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) - * @ccid3hcrx_rtt - Receiver estimate of RTT - * @ccid3hcrx_tstamp_last_feedback - Time at which last feedback was sent - * @ccid3hcrx_tstamp_last_ack - Time at which last feedback was sent - * @ccid3hcrx_hist - Packet history (loss detection + RTT sampling) - * @ccid3hcrx_li_hist - Loss Interval database - * @ccid3hcrx_s - Received packet size in bytes - * @ccid3hcrx_pinv - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) + * @x_recv - Receiver estimate of send rate (RFC 3448 4.3) + * @rtt - Receiver estimate of rtt (non-standard) + * @p - Current loss event rate (RFC 3448 5.4) + * @last_counter - Tracks window counter (RFC 4342, 8.1) + * @state - Receiver state, one of %ccid3_hc_rx_states + * @bytes_recv - Total sum of DCCP payload bytes + * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) + * @rtt - Receiver estimate of RTT + * @tstamp_last_feedback - Time at which last feedback was sent + * @tstamp_last_ack - Time at which last feedback was sent + * @hist - Packet history (loss detection + RTT sampling) + * @li_hist - Loss Interval database + * @s - Received packet size in bytes + * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) */ struct ccid3_hc_rx_sock { - u8 ccid3hcrx_last_counter:4; - enum ccid3_hc_rx_states ccid3hcrx_state:8; - u32 ccid3hcrx_bytes_recv; - u32 ccid3hcrx_x_recv; - u32 ccid3hcrx_rtt; - ktime_t ccid3hcrx_tstamp_last_feedback; - struct tfrc_rx_hist ccid3hcrx_hist; - struct tfrc_loss_hist ccid3hcrx_li_hist; - u16 ccid3hcrx_s; -#define ccid3hcrx_pinv ccid3hcrx_li_hist.i_mean + u8 last_counter:4; + enum ccid3_hc_rx_states state:8; + u32 bytes_recv; + u32 x_recv; + u32 rtt; + ktime_t tstamp_last_feedback; + struct tfrc_rx_hist hist; + struct tfrc_loss_hist li_hist; + u16 s; +#define p_inverse li_hist.i_mean }; static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk) diff --git a/net/dccp/probe.c b/net/dccp/probe.c index 9ca783d74678..a87fd4fc1b3d 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -87,10 +87,8 @@ static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, "%llu %llu %d\n", NIPQUAD(inet->saddr), ntohs(inet->sport), NIPQUAD(inet->daddr), ntohs(inet->dport), size, - hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, - hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc, - hctx->ccid3hctx_x_recv >> 6, - hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi); + hctx->s, hctx->rtt, hctx->p, hctx->x_calc, + hctx->x_recv >> 6, hctx->x >> 6, hctx->t_ipi); else printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n", NIPQUAD(inet->saddr), ntohs(inet->sport), From b2e317f4b5ae73733963c702fae0f246d234100b Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 051/105] dccp ccid-3: No more CCID control blocks in LISTEN state The CCIDs are activated as last of the features, at the end of the handshake, were the LISTEN state of the master socket is inherited into the server state of the child socket. Thus, the only states visible to CCIDs now are OPEN/PARTOPEN, and the closing states. This allows to remove tests which were previously necessary to protect against referencing a socket in the listening state (in CCID3), but which now have become redundant. As a further byproduct of enabling the CCIDs only after the connection has been fully established, several typecast-initialisations of ccid3_hc_{rx,tx}_sock can now be eliminated: * the CCID is loaded, so it is not necessary to test if it is NULL, * if it is possible to load a CCID and leave the private area NULL, then this is a bug, which should crash loudly - and earlier, * the test for state==OPEN || state==PARTOPEN now reduces only to the closing phase (e.g. when the node has received an unexpected Reset). Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/ccids/ccid3.c | 40 +++++++--------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 2f026ce27148..b4cc62e2e2bb 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -565,29 +565,17 @@ static void ccid3_hc_tx_exit(struct sock *sk) static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) { - struct ccid3_hc_tx_sock *hctx; - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return; - - hctx = ccid3_hc_tx_sk(sk); - info->tcpi_rto = hctx->t_rto; - info->tcpi_rtt = hctx->rtt; + info->tcpi_rto = ccid3_hc_tx_sk(sk)->t_rto; + info->tcpi_rtt = ccid3_hc_tx_sk(sk)->rtt; } static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_tx_sock *hctx; + const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); struct tfrc_tx_info tfrc; const void *val; - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return -EINVAL; - - hctx = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: if (len < sizeof(tfrc)) @@ -707,14 +695,12 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { - const struct ccid3_hc_rx_sock *hcrx; + const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); __be32 x_recv, pinv; if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return 0; - hcrx = ccid3_hc_rx_sk(sk); - if (dccp_packet_without_ack(skb)) return 0; @@ -876,30 +862,18 @@ static void ccid3_hc_rx_exit(struct sock *sk) static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { - const struct ccid3_hc_rx_sock *hcrx; - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return; - - hcrx = ccid3_hc_rx_sk(sk); - info->tcpi_ca_state = hcrx->state; + info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = hcrx->rtt; + info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rtt; } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_rx_sock *hcrx; + const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); struct tfrc_rx_info rx_info; const void *val; - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return -EINVAL; - - hcrx = ccid3_hc_rx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: if (len < sizeof(rx_info)) From de6f2b59e5cd15a8772adb732a1d80e141a77115 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 052/105] dccp ccid-3: Bug fix for the inter-packet scheduling algorithm This fixes a subtle bug in the calculation of the inter-packet gap and shows that t_delta, as it is currently used, is not needed. And hence replaced. The algorithm from RFC 3448, 4.6 below continually computes a send time t_nom, which is initialised with the current time t_now; t_gran = 1E6 / HZ specifies the scheduling granularity, s the packet size, and X the sending rate: t_distance = t_nom - t_now; // in microseconds t_delta = min(t_ipi, t_gran) / 2; // `delta' parameter in microseconds if (t_distance >= t_delta) { reschedule after (t_distance / 1000) milliseconds; } else { t_ipi = s / X; // inter-packet interval in usec t_nom += t_ipi; // compute the next send time send packet now; } 1) Description of the bug ------------------------- Rescheduling requires a conversion into milliseconds, due to this call chain: * ccid3_hc_tx_send_packet() returns a timeout in milliseconds, * this value is converted by msecs_to_jiffies() in dccp_write_xmit(), * and finally used as jiffy-expires-value for sk_reset_timer(). The highest jiffy resolution with HZ=1000 is 1 millisecond, so using a higher granularity does not make much sense here. As a consequence, values of t_distance < 1000 are truncated to 0. This issue has so far been resolved by using instead if (t_distance >= t_delta + 1000) reschedule after (t_distance / 1000) milliseconds; The bug is in artificially inflating t_delta to t_delta' = t_delta + 1000. This is unnecessarily large, a more adequate value is t_delta' = max(t_delta, 1000). 2) Consequences of using the corrected t_delta' ----------------------------------------------- Since t_delta <= t_gran/2 = 10^6/(2*HZ), we have t_delta <= 1000 as long as HZ >= 500. This means that t_delta' = max(1000, t_delta) is constant at 1000. On the other hand, when using a coarse HZ value of HZ < 500, we have three sub-cases that can all be reduced to using another constant of t_gran/2. (a) The first case arises when t_ipi > t_gran. Here t_delta' is the constant t_delta' = max(1000, t_gran/2) = t_gran/2. (b) If t_ipi <= 2000 < t_gran = 10^6/HZ usec, then t_delta = t_ipi/2 <= 1000, so that t_delta' = max(1000, t_delta) = 1000 < t_gran/2. (c) If 2000 < t_ipi <= t_gran, we have t_delta' = max(t_delta, 1000) = t_ipi/2. In the second and third cases we have delay values less than t_gran/2, which is in the order of less than or equal to half a jiffy. How these are treated depends on how fractions of a jiffy are handled: they are either always rounded down to 0, or always rounded up to 1 jiffy (assuming non-zero values). In both cases the error is on average in the order of 50%. Thus we are not increasing the error when in the second/third case we replace a value less than t_gran/2 with 0, by setting t_delta' to the constant t_gran/2. 3) Summary ---------- Fixing (1) and considering (2), the patch replaces t_delta with a constant, whose value depends on CONFIG_HZ, changing the above algorithm to: if (t_distance >= t_delta') reschedule after (t_distance / 1000) milliseconds; where t_delta' = 10^6/(2*HZ) if HZ < 500, and t_delta' = 1000 otherwise. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 17 +++++++---------- net/dccp/ccids/ccid3.h | 19 ++++++++++++++----- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index b4cc62e2e2bb..eb1bda08eeb2 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -93,19 +93,16 @@ static inline u64 rfc3390_initial_rate(struct sock *sk) return scaled_div(w_init << 6, hctx->rtt); } -/* - * Recalculate t_ipi and delta (should be called whenever X changes) +/** + * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst + * This respects the granularity of X_inst (64 * bytes/second). */ static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) { - /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); - /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ - hctx->delta = min_t(u32, hctx->t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); - - ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hctx->t_ipi, - hctx->delta, hctx->s, (unsigned)(hctx->x >> 6)); + ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hctx->t_ipi, + hctx->s, (unsigned)(hctx->x >> 6)); } static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) @@ -340,8 +337,8 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * else * // send the packet in (t_nom - t_now) milliseconds. */ - if (delay - (s64)hctx->delta >= 1000) - return (u32)delay / 1000L; + if (delay >= TFRC_T_DELTA) + return (u32)delay / USEC_PER_MSEC; ccid3_hc_tx_update_win_count(hctx, now); break; diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 0cfcfff8f5fb..92a5e1688a07 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -47,12 +47,23 @@ /* Two seconds as per RFC 3448 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) -/* In usecs - half the scheduling granularity as per RFC3448 4.6 */ -#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) - /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ #define TFRC_T_MBI 64 +/* + * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are + * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. + * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse + * resolution of HZ < 500 means that the error is below one timer tick (t_gran) + * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). + */ +#if (HZ >= 500) +# define TFRC_T_DELTA USEC_PER_MSEC +#else +# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) +#warning Coarse CONFIG_HZ resolution -- higher value recommended for TFRC. +#endif + enum ccid3_options { TFRC_OPT_LOSS_EVENT_RATE = 192, TFRC_OPT_LOSS_INTERVALS = 193, @@ -92,7 +103,6 @@ enum ccid3_hc_tx_states { * @no_feedback_timer - Handle to no feedback timer * @t_ld - Time last doubled during slow start * @t_nom - Nominal send time of next packet - * @delta - Send timer delta (RFC 3448, 4.6) in usecs * @hist - Packet history * @options_received - Parsed set of retrieved options */ @@ -111,7 +121,6 @@ struct ccid3_hc_tx_sock { struct timer_list no_feedback_timer; ktime_t t_ld; ktime_t t_nom; - u32 delta; struct tfrc_tx_hist_entry *hist; struct ccid3_options_received options_received; }; From 63b3a73bb85daf441f964aaf9b3fc89be4209c23 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 053/105] dccp ccid-3: Remove ugly RTT-sampling history lookup This removes the RTT-sampling function tfrc_tx_hist_rtt(), since 1. it suffered from complex passing of return values (the return value both indicated successful lookup while the value doubled as RTT sample); 2. when for some odd reason the sample value equalled 0, this triggered a bug warning about "bogus Ack", due to the ambiguity of the return value; 3. on a passive host which has not sent anything the TX history is empty and thus will lead to unwanted "bogus Ack" warnings such as ccid3_hc_tx_packet_recv: server(e7b7d518): DATAACK with bogus ACK-28197148 ccid3_hc_tx_packet_recv: server(e7b7d518): DATAACK with bogus ACK-26641606. The fix is to replace the implicit encoding by performing the steps manually. Furthermore, the "bogus Ack" warning has been removed, since it can actually be triggered due to several reasons (network reordering, old packet, (3) above), hence it is not very useful. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 34 +++++++++++++----------- net/dccp/ccids/lib/packet_history.c | 40 ----------------------------- net/dccp/ccids/lib/packet_history.h | 22 +++++++++++++--- 3 files changed, 38 insertions(+), 58 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index eb1bda08eeb2..f74e58d9793f 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -371,6 +371,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv = &hctx->options_received; + struct tfrc_tx_hist_entry *acked; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -384,17 +385,24 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx->state != TFRC_SSTATE_NO_FBACK) return; - now = ktime_get_real(); - - /* Estimate RTT from history if ACK number is valid */ - r_sample = tfrc_tx_hist_rtt(hctx->hist, - DCCP_SKB_CB(skb)->dccpd_ack_seq, now); - if (r_sample == 0) { - DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, - dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), - (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); + /* + * Locate the acknowledged packet in the TX history. + * + * Returning "entry not found" here can for instance happen when + * - the host has not sent out anything (e.g. a passive server), + * - the Ack is outdated (packet with higher Ack number was received), + * - it is a bogus Ack (for a packet not sent on this connection). + */ + acked = tfrc_tx_hist_find_entry(hctx->hist, dccp_hdr_ack_seq(skb)); + if (acked == NULL) return; - } + /* For the sake of RTT sampling, ignore/remove all older entries */ + tfrc_tx_hist_purge(&acked->next); + + /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ + now = ktime_get_real(); + r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); + hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); /* Update receive rate in units of 64 * bytes/second */ hctx->x_recv = opt_recv->ccid3or_receive_rate; @@ -406,11 +414,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx->p = 0; else /* can not exceed 100% */ hctx->p = scaled_div(1, pinv); - /* - * Validate new RTT sample and update moving average - */ - r_sample = dccp_sample_rtt(sk, r_sample); - hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); + /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 6cc108afdc3b..5c4450866904 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -40,18 +40,6 @@ #include "packet_history.h" #include "../../dccp.h" -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - /* * Transmitter History Routines */ @@ -73,15 +61,6 @@ void tfrc_tx_packet_history_exit(void) } } -static struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - - return head; -} - int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) { struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); @@ -111,25 +90,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) } EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); -u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, - const ktime_t now) -{ - u32 rtt = 0; - struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); - - if (packet != NULL) { - rtt = ktime_us_delta(now, packet->stamp); - /* - * Garbage-collect older (irrelevant) entries: - */ - tfrc_tx_hist_purge(&packet->next); - } - - return rtt; -} -EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); - - /* * Receiver History Routines */ diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 461cc91cce88..221d8102da51 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -40,12 +40,28 @@ #include #include "tfrc.h" -struct tfrc_tx_hist_entry; +/** + * tfrc_tx_hist_entry - Simple singly-linked TX history list + * @next: next oldest entry (LIFO order) + * @seqno: sequence number of this entry + * @stamp: send time of packet with sequence number @seqno + */ +struct tfrc_tx_hist_entry { + struct tfrc_tx_hist_entry *next; + u64 seqno; + ktime_t stamp; +}; + +static inline struct tfrc_tx_hist_entry * + tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) +{ + while (head != NULL && head->seqno != seqno) + head = head->next; + return head; +} extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); -extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, - const u64 seqno, const ktime_t now); /* Subtraction a-b modulo-16, respects circular wrap-around */ #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) From 47a61e7b433a014296971ea1226eb1adb6310ab4 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 054/105] dccp ccid-3: Simplify and consolidate tx_parse_options This simplifies and consolidates the TX option-parsing code: 1. The Loss Intervals option is not currently used, so dead code related to this option is removed. I am aware of no plans to support the option, but if someone wants to implement it (e.g. for inter-op tests), it is better to start afresh than having to also update currently unused code. 2. The Loss Event and Receive Rate options have a lot of code in common (both are 32 bit, both have same length etc.), so this is consolidated. 3. The test against GSR is not necessary, because - on first loading CCID3, ccid_new() zeroes out all fields in the socket; - ccid3_hc_tx_packet_recv() treats 0 and ~0U equivalently, due to pinv = opt_recv->ccid3or_loss_event_rate; if (pinv == ~0U || pinv == 0) hctx->p = 0; - as a result, the sequence number field is removed from opt_recv. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 61 +++++++++++------------------------------- net/dccp/ccids/ccid3.h | 3 --- 2 files changed, 16 insertions(+), 48 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index f74e58d9793f..12b601f11bfd 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -487,60 +487,31 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, unsigned char len, u16 idx, unsigned char *value) { - int rc = 0; - const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv = &hctx->options_received; __be32 opt_val; - if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { - opt_recv->ccid3or_seqno = dp->dccps_gsr; - opt_recv->ccid3or_loss_event_rate = ~0; - opt_recv->ccid3or_loss_intervals_idx = 0; - opt_recv->ccid3or_loss_intervals_len = 0; - opt_recv->ccid3or_receive_rate = 0; - } - switch (option) { + case TFRC_OPT_RECEIVE_RATE: case TFRC_OPT_LOSS_EVENT_RATE: if (unlikely(len != 4)) { - DCCP_WARN("%s(%p), invalid len %d " - "for TFRC_OPT_LOSS_EVENT_RATE\n", - dccp_role(sk), sk, len); - rc = -EINVAL; - } else { - opt_val = get_unaligned((__be32 *)value); - opt_recv->ccid3or_loss_event_rate = ntohl(opt_val); - ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, - opt_recv->ccid3or_loss_event_rate); + DCCP_WARN("%s(%p), invalid len %d for %u\n", + dccp_role(sk), sk, len, option); + return -EINVAL; } - break; - case TFRC_OPT_LOSS_INTERVALS: - opt_recv->ccid3or_loss_intervals_idx = idx; - opt_recv->ccid3or_loss_intervals_len = len; - ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", - dccp_role(sk), sk, - opt_recv->ccid3or_loss_intervals_idx, - opt_recv->ccid3or_loss_intervals_len); - break; - case TFRC_OPT_RECEIVE_RATE: - if (unlikely(len != 4)) { - DCCP_WARN("%s(%p), invalid len %d " - "for TFRC_OPT_RECEIVE_RATE\n", - dccp_role(sk), sk, len); - rc = -EINVAL; - } else { - opt_val = get_unaligned((__be32 *)value); - opt_recv->ccid3or_receive_rate = ntohl(opt_val); - ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, - opt_recv->ccid3or_receive_rate); - } - break; - } + opt_val = ntohl(get_unaligned((__be32 *)value)); - return rc; + if (option == TFRC_OPT_RECEIVE_RATE) { + opt_recv->ccid3or_receive_rate = opt_val; + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", + dccp_role(sk), sk, opt_val); + } else { + opt_recv->ccid3or_loss_event_rate = opt_val; + ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", + dccp_role(sk), sk, opt_val); + } + } + return 0; } static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 92a5e1688a07..2268785ae8ee 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -71,9 +71,6 @@ enum ccid3_options { }; struct ccid3_options_received { - u64 ccid3or_seqno:48, - ccid3or_loss_intervals_idx:16; - u16 ccid3or_loss_intervals_len; u32 ccid3or_loss_event_rate; u32 ccid3or_receive_rate; }; From 3306c781ff13aea89606435c134ec84e3c608681 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 055/105] dccp: Add packet type information to CCID-specific option parsing This patch ... 1. adds packet type information to ccid_hc_{rx,tx}_parse_options(). This is necessary, since table 3 in RFC 4340, 5.8 leaves it to the CCIDs to state which options may (not) appear on what packet type. 2. adds such a check for CCID-3's {Loss Event, Receive} Rate as specified in RFC 4340 8.3 ("Receive Rate options MUST NOT be sent on DCCP-Data packets") and 8.5 ("Loss Event Rate options MUST NOT be sent on DCCP-Data packets"). 3. removes an unused argument `idx' from ccid_hc_{rx,tx}_parse_options(). This is also no longer necessary, since the CCID-specific option-parsing routines are passed every single parameter of the type-length-value option encoding. Also added documentation and made argument naming scheme consistent. Signed-off-by: Gerrit Renker --- net/dccp/ccid.h | 46 +++++++++++++++++++++--------------------- net/dccp/ccids/ccid3.c | 14 +++++++------ net/dccp/options.c | 16 ++++----------- 3 files changed, 35 insertions(+), 41 deletions(-) diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 20ba066b2775..bce517c99996 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -60,18 +60,14 @@ struct ccid_operations { void (*ccid_hc_tx_exit)(struct sock *sk); void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_rx_parse_options)(struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value); + int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, + u8 opt, u8 *val, u8 len); int (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_tx_parse_options)(struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value); + int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, + u8 opt, u8 *val, u8 len); int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, @@ -163,27 +159,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); } +/** + * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver + * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) + * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) + * @val: value of @opt + * @len: length of @val in bytes + */ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value) + u8 pkt, u8 opt, u8 *val, u8 len) { - int rc = 0; - if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) - rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, - value); - return rc; + if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) + return 0; + return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); } +/** + * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender + * Arguments are analogous to ccid_hc_tx_parse_options() + */ static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value) + u8 pkt, u8 opt, u8 *val, u8 len) { - int rc = 0; - if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) - rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value); - return rc; + if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) + return 0; + return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); } static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 12b601f11bfd..4c422fb2189f 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -483,9 +483,8 @@ done_computing_x: jiffies + usecs_to_jiffies(t_nfb)); } -static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, - unsigned char len, u16 idx, - unsigned char *value) +static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, + u8 option, u8 *optval, u8 optlen) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv = &hctx->options_received; @@ -494,12 +493,15 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, switch (option) { case TFRC_OPT_RECEIVE_RATE: case TFRC_OPT_LOSS_EVENT_RATE: - if (unlikely(len != 4)) { + /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ + if (packet_type == DCCP_PKT_DATA) + break; + if (unlikely(optlen != 4)) { DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, len, option); + dccp_role(sk), sk, optlen, option); return -EINVAL; } - opt_val = ntohl(get_unaligned((__be32 *)value)); + opt_val = ntohl(get_unaligned((__be32 *)optval)); if (option == TFRC_OPT_RECEIVE_RATE) { opt_recv->ccid3or_receive_rate = opt_val; diff --git a/net/dccp/options.c b/net/dccp/options.c index fd51cc70c63e..b102774694c6 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -226,23 +226,15 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; - case 128 ... 191: { - const u16 idx = value - options; - + case 128 ... 191: if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, - opt, len, idx, - value) != 0) + pkt_type, opt, value, len)) goto out_invalid_option; - } break; - case 192 ... 255: { - const u16 idx = value - options; - + case 192 ... 255: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, - opt, len, idx, - value) != 0) + pkt_type, opt, value, len)) goto out_invalid_option; - } break; default: DCCP_CRIT("DCCP(%p): option %d(len=%d) not " From 535c55df136ad2783d444e54d518a8fae8bdbf79 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 056/105] dccp tfrc/ccid-3: Computing Loss Rate from Loss Event Rate This adds a function to take care of the following cases occurring in the computation of the Loss Rate p: * 1/(2^32-1) is mapped into 0% as per RFC 4342, 8.5; * 1/0 is mapped into the maximum of 100%; * we want to avoid that p = 1/x is rounded down to 0 when x is very large, since this means accidentally re-entering slow-start (indicated by p==0). In the last case, the minimum-resolution value of p is returned. Furthermore, a bug in ccid3_hc_rx_getsockopt is fixed (1/0 was mapped into ~0U), which now allows to consistently print the scaled p-values as printf("Loss Event Rate = %u.%04u %%\n", rx_info.tfrcrx_p / 10000, rx_info.tfrcrx_p % 10000); Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 9 ++++----- net/dccp/ccids/lib/tfrc.h | 1 + net/dccp/ccids/lib/tfrc_equation.c | 17 +++++++++++++++-- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 4c422fb2189f..206204551f4d 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -410,10 +410,10 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* Update loss event rate (which is scaled by 1e6) */ pinv = opt_recv->ccid3or_loss_event_rate; - if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ + if (pinv == 0) hctx->p = 0; - else /* can not exceed 100% */ - hctx->p = scaled_div(1, pinv); + else + hctx->p = tfrc_invert_loss_event_rate(pinv); /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 @@ -854,8 +854,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, return -EINVAL; rx_info.tfrcrx_x_recv = hcrx->x_recv; rx_info.tfrcrx_rtt = hcrx->rtt; - rx_info.tfrcrx_p = hcrx->p_inverse == 0 ? ~0U : - scaled_div(1, hcrx->p_inverse); + rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); len = sizeof(rx_info); val = &rx_info; break; diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index ed9857527acf..bb47146ac7d1 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -58,6 +58,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); +extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); extern int tfrc_tx_packet_history_init(void); extern void tfrc_tx_packet_history_exit(void); diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 2f20a29cffe4..bc3dc2b2a849 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -658,7 +658,6 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) result = scaled_div(s, R); return scaled_div32(result, f); } - EXPORT_SYMBOL_GPL(tfrc_calc_x); /** @@ -693,5 +692,19 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) index = tfrc_binsearch(fvalue, 0); return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } - EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); + +/** + * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% + * When @loss_event_rate is large, there is a chance that p is truncated to 0. + * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. + */ +u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) +{ + if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ + return 0; + if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ + return 1000000; + return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); +} +EXPORT_SYMBOL_GPL(tfrc_invert_loss_event_rate); From ce177ae2e6b196659e93a9408cc1f5f13f206d13 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 057/105] dccp ccid-3: Remove redundant 'options_received' struct The `options_received' struct is redundant, since it re-duplicates the existing `p' and `x_recv' fields. This patch removes the sub-struct and migrates the format conversion operations (cf. below) to ccid3_hc_tx_parse_options(). Why the fields are redundant ---------------------------- The Loss Event Rate p and the Receive Rate x_recv are initially 0 when first loading CCID-3, as ccid_new() zeroes out the entire ccid3_hc_tx_sock. When Loss Event Rate or Receive Rate options are received, they are stored by ccid3_hc_tx_parse_options() into the fields `ccid3or_loss_event_rate' and `ccid3or_receive_rate' of the sub-struct `options_received' in ccid3_hc_tx_sock. After parsing (considering only the established state - dccp_rcv_established()), the packet is passed on to ccid_hc_tx_packet_recv(). This calls the CCID-3 specific routine ccid3_hc_tx_packet_recv(), which performs the following copy operations between fields of ccid3_hc_tx_sock: * hctx->options_received.ccid3or_receive_rate is copied into hctx->x_recv, after scaling it for fixpoint arithmetic, by 2^64; * hctx->options_received.ccid3or_loss_event_rate is copied into hctx->p, considering the above special cases; in addition, a value of 0 here needs to be mapped into p=0 (when no Loss Event Rate option has been received yet). Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 24 ++++++++---------------- net/dccp/ccids/ccid3.h | 7 ------- 2 files changed, 8 insertions(+), 23 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 206204551f4d..e7db8a4ee642 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -370,11 +370,10 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv = &hctx->options_received; struct tfrc_tx_hist_entry *acked; ktime_t now; unsigned long t_nfb; - u32 pinv, r_sample; + u32 r_sample; /* we are only interested in ACKs */ if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || @@ -404,17 +403,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); hctx->rtt = tfrc_ewma(hctx->rtt, r_sample, 9); - /* Update receive rate in units of 64 * bytes/second */ - hctx->x_recv = opt_recv->ccid3or_receive_rate; - hctx->x_recv <<= 6; - - /* Update loss event rate (which is scaled by 1e6) */ - pinv = opt_recv->ccid3or_loss_event_rate; - if (pinv == 0) - hctx->p = 0; - else - hctx->p = tfrc_invert_loss_event_rate(pinv); - /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ @@ -487,7 +475,6 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, u8 option, u8 *optval, u8 optlen) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv = &hctx->options_received; __be32 opt_val; switch (option) { @@ -504,11 +491,16 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, opt_val = ntohl(get_unaligned((__be32 *)optval)); if (option == TFRC_OPT_RECEIVE_RATE) { - opt_recv->ccid3or_receive_rate = opt_val; + /* Receive Rate is kept in units of 64 bytes/second */ + hctx->x_recv = opt_val; + hctx->x_recv <<= 6; + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", dccp_role(sk), sk, opt_val); } else { - opt_recv->ccid3or_loss_event_rate = opt_val; + /* Update the fixpoint Loss Event Rate fraction */ + hctx->p = tfrc_invert_loss_event_rate(opt_val); + ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", dccp_role(sk), sk, opt_val); } diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 2268785ae8ee..ae210786c895 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -70,11 +70,6 @@ enum ccid3_options { TFRC_OPT_RECEIVE_RATE = 194, }; -struct ccid3_options_received { - u32 ccid3or_loss_event_rate; - u32 ccid3or_receive_rate; -}; - /* TFRC sender states */ enum ccid3_hc_tx_states { TFRC_SSTATE_NO_SENT = 1, @@ -101,7 +96,6 @@ enum ccid3_hc_tx_states { * @t_ld - Time last doubled during slow start * @t_nom - Nominal send time of next packet * @hist - Packet history - * @options_received - Parsed set of retrieved options */ struct ccid3_hc_tx_sock { u64 x; @@ -119,7 +113,6 @@ struct ccid3_hc_tx_sock { ktime_t t_ld; ktime_t t_nom; struct tfrc_tx_hist_entry *hist; - struct ccid3_options_received options_received; }; static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) From f10ecaee6dc2c6d56783462b2a82e98bc81b55f4 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 058/105] dccp: Replace magic CCID-specific numbers by symbolic constants The constants DCCPO_{MIN,MAX}_CCID_SPECIFIC are nowhere used in the code, but instead for the CCID-specific options numbers are used. This patch unifies the use of CCID-specific option numbers, by adding symbolic names reflecting the definitions in RFC 4340, 10.3. Signed-off-by: Gerrit Renker --- include/linux/dccp.h | 6 ++++-- net/dccp/options.c | 13 +++---------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 7434a8353e23..7187bd8a75f6 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -165,8 +165,10 @@ enum { DCCPO_TIMESTAMP_ECHO = 42, DCCPO_ELAPSED_TIME = 43, DCCPO_MAX = 45, - DCCPO_MIN_CCID_SPECIFIC = 128, - DCCPO_MAX_CCID_SPECIFIC = 255, + DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */ + DCCPO_MAX_RX_CCID_SPECIFIC = 191, + DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */ + DCCPO_MAX_TX_CCID_SPECIFIC = 255, }; /* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ #define DCCP_SINGLE_OPT_MAXLEN 253 diff --git a/net/dccp/options.c b/net/dccp/options.c index b102774694c6..e1c94e2b8be0 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -96,18 +96,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, } /* - * CCID-Specific Options (from RFC 4340, sec. 10.3): - * - * Option numbers 128 through 191 are for options sent from the - * HC-Sender to the HC-Receiver; option numbers 192 through 255 - * are for options sent from the HC-Receiver to the HC-Sender. - * * CCID-specific options are ignored during connection setup, as * negotiation may still be in progress (see RFC 4340, 10.3). * The same applies to Ack Vectors, as these depend on the CCID. - * */ - if (dreq != NULL && (opt >= 128 || + if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) goto ignore_option; @@ -226,12 +219,12 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; - case 128 ... 191: + case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, pkt_type, opt, value, len)) goto out_invalid_option; break; - case 192 ... 255: + case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, pkt_type, opt, value, len)) goto out_invalid_option; From c506d91d9ab7681e058afcd750e9118c6cdaabc1 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 059/105] dccp: Unused argument in CCID tx function This removes the argument `more' from ccid_hc_tx_packet_sent, since it was nowhere used in the entire code. (Anecdotally, this argument was not even used in the original KAME code where the function originally came from; compare the variable moreToSend in the freebsd61-dccp-kame-28.08.2006.patch now maintained by Emmanuel Lochin.) Signed-off-by: Gerrit Renker --- net/dccp/ccid.h | 6 +++--- net/dccp/ccids/ccid2.c | 2 +- net/dccp/ccids/ccid3.c | 3 +-- net/dccp/output.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index bce517c99996..430a3c24f970 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -71,7 +71,7 @@ struct ccid_operations { int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, - int more, unsigned int len); + unsigned int len); void (*ccid_hc_rx_get_info)(struct sock *sk, struct tcp_info *info); void (*ccid_hc_tx_get_info)(struct sock *sk, @@ -139,10 +139,10 @@ static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, - int more, unsigned int len) + unsigned int len) { if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); + ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); } static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 9728bbf0acea..f56ab68a4b78 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -221,7 +221,7 @@ static void ccid2_start_rto_timer(struct sock *sk) jiffies + hctx->rto); } -static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) +static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index e7db8a4ee642..c1cc66edfad4 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -356,8 +356,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) return 0; } -static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, - unsigned int len) +static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); diff --git a/net/dccp/output.c b/net/dccp/output.c index 9888f61f9b0b..be65bc30e201 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -301,7 +301,7 @@ void dccp_write_xmit(struct sock *sk, int block) dcb->dccpd_type = DCCP_PKT_DATA; err = dccp_transmit_skb(sk, skb); - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); if (err) DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", err); From 5fe94963a163fecc34b7b51bf2ca525f9f50d7bf Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 060/105] dccp ccid-3: Remove duplicate documentation This removes RX-socket documentation which is either duplicate or non-existent. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index ae210786c895..de26db82d65f 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -131,16 +131,12 @@ enum ccid3_hc_rx_states { /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket * - * @x_recv - Receiver estimate of send rate (RFC 3448 4.3) - * @rtt - Receiver estimate of rtt (non-standard) - * @p - Current loss event rate (RFC 3448 5.4) * @last_counter - Tracks window counter (RFC 4342, 8.1) * @state - Receiver state, one of %ccid3_hc_rx_states * @bytes_recv - Total sum of DCCP payload bytes * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) * @rtt - Receiver estimate of RTT * @tstamp_last_feedback - Time at which last feedback was sent - * @tstamp_last_ack - Time at which last feedback was sent * @hist - Packet history (loss detection + RTT sampling) * @li_hist - Loss Interval database * @s - Received packet size in bytes From d0995e6a9e3328cdc76b4c45882dee118284f960 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 061/105] dccp ccid-3: Remove dead states This patch is thanks to an investigation by Leandro Sales de Melo and his colleagues. They worked out two state diagrams which highlight the fact that the xxx_TERM states in CCID-3/4 are in fact not necessary. And this can be confirmed by in turn looking at the code: the xxx_TERM states are only ever set in ccid3_hc_{rx,tx}_exit(). These two functions are part of the following call chain: * ccid_hc_{tx,rx}_exit() are called from ccid_delete() only; * ccid_delete() invokes ccid_hc_{tx,rx}_exit() in the way of a destructor: after calling ccid_hc_{tx,rx}_exit(), the CCID is released from memory; * ccid_delete() is in turn called only by ccid_hc_{tx,rx}_delete(); * ccid_hc_{tx,rx}_delete() is called only if - feature negotiation failed (dccp_feat_activate_values()), - when changing the RX/TX CCID (to eject the current CCID), - when destroying the socket (in dccp_destroy_sock()). In other words, when CCID-3 sets the state to xxx_TERM, it is at a time where no more processing should be going on, hence it is not necessary to introduce a dedicated exit state - this is implicit when unloading the CCID. The patch removes this state, one switch-statement collapses as a result. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 37 +++++++++---------------------------- net/dccp/ccids/ccid3.h | 2 -- 2 files changed, 9 insertions(+), 30 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index c1cc66edfad4..0751a8fa936a 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -56,7 +56,6 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) [TFRC_SSTATE_NO_SENT] = "NO_SENT", [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", [TFRC_SSTATE_FBACK] = "FBACK", - [TFRC_SSTATE_TERM] = "TERM", }; return ccid3_state_names[state]; @@ -210,10 +209,13 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, ccid3_tx_state_name(hctx->state)); + /* Ignore and do not restart after leaving the established state */ + if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) + goto out; + + /* Reset feedback state to "no feedback received" */ if (hctx->state == TFRC_SSTATE_FBACK) ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - else if (hctx->state != TFRC_SSTATE_NO_FBACK) - goto out; /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 @@ -288,8 +290,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) if (unlikely(skb->len == 0)) return -EBADMSG; - switch (hctx->state) { - case TFRC_SSTATE_NO_SENT: + if (hctx->state == TFRC_SSTATE_NO_SENT) { sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); hctx->last_win_count = 0; @@ -324,9 +325,8 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) ccid3_update_send_interval(hctx); ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - break; - case TFRC_SSTATE_NO_FBACK: - case TFRC_SSTATE_FBACK: + + } else { delay = ktime_us_delta(hctx->t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); /* @@ -341,10 +341,6 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) return (u32)delay / USEC_PER_MSEC; ccid3_hc_tx_update_win_count(hctx, now); - break; - case TFRC_SSTATE_TERM: - DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); - return -EINVAL; } /* prepare to send now (add options etc.) */ @@ -378,11 +374,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) return; - /* ... and only in the established state */ - if (hctx->state != TFRC_SSTATE_FBACK && - hctx->state != TFRC_SSTATE_NO_FBACK) - return; - /* * Locate the acknowledged packet in the TX history. * @@ -522,9 +513,7 @@ static void ccid3_hc_tx_exit(struct sock *sk) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); sk_stop_timer(sk, &hctx->no_feedback_timer); - tfrc_tx_hist_purge(&hctx->hist); } @@ -583,7 +572,6 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) static char *ccid3_rx_state_names[] = { [TFRC_RSTATE_NO_DATA] = "NO_DATA", [TFRC_RSTATE_DATA] = "DATA", - [TFRC_RSTATE_TERM] = "TERM", }; return ccid3_rx_state_names[state]; @@ -609,14 +597,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); struct dccp_sock *dp = dccp_sk(sk); - ktime_t now; + ktime_t now = ktime_get_real(); s64 delta = 0; - if (unlikely(hcrx->state == TFRC_RSTATE_TERM)) - return; - - now = ktime_get_real(); - switch (fbtype) { case CCID3_FBACK_INITIAL: hcrx->x_recv = 0; @@ -819,8 +802,6 @@ static void ccid3_hc_rx_exit(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); - tfrc_rx_hist_purge(&hcrx->hist); tfrc_lh_cleanup(&hcrx->li_hist); } diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index de26db82d65f..7884159937aa 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -75,7 +75,6 @@ enum ccid3_hc_tx_states { TFRC_SSTATE_NO_SENT = 1, TFRC_SSTATE_NO_FBACK, TFRC_SSTATE_FBACK, - TFRC_SSTATE_TERM, }; /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket @@ -126,7 +125,6 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) enum ccid3_hc_rx_states { TFRC_RSTATE_NO_DATA = 1, TFRC_RSTATE_DATA, - TFRC_RSTATE_TERM = 127, }; /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket From 2975abd251d795810932b20354729ba236d95bf9 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 062/105] dccp: Schedule an Ack when receiving timestamps This schedules an Ack when receiving a timestamp, exploiting the existing inet_csk_schedule_ack() function, saving one case in the `dccp_ack_pending()' function. Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 2 +- net/dccp/options.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index c7370de4c4d5..f9ed0cbd1bf3 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -431,7 +431,7 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq) static inline int dccp_ack_pending(const struct sock *sk) { const struct dccp_sock *dp = dccp_sk(sk); - return dp->dccps_timestamp_echo != 0 || + return #ifdef CONFIG_IP_DCCP_ACKVEC (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || diff --git a/net/dccp/options.c b/net/dccp/options.c index e1c94e2b8be0..9fe0510f4022 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -163,6 +163,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_role(sk), ntohl(opt_val), (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq); + /* schedule an Ack in case this sender is quiescent */ + inet_csk_schedule_ack(sk); break; case DCCPO_TIMESTAMP_ECHO: if (len != 4 && len != 6 && len != 8) From bfbddd085a5bced6efb9e1bc4d029438f9639784 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 063/105] dccp: Fix the adjustments to AWL and SWL This fixes a problem and a potential loophole with regard to seqno/ackno validity: the problem is that the initial adjustments to AWL/SWL were only performed at the begin of the connection, during the handshake. Since the Sequence Window feature is always greater than Wmin=32 (7.5.2), it is however necessary to perform these adjustments at least for the first W/W' (variables as per 7.5.1) packets in the lifetime of a connection. This requirement is complicated by the fact that W/W' can change at any time during the lifetime of a connection. Therefore the consequence is to perform this safety check each time SWL/AWL are updated. A second problem solved by this patch is that the remote/local Sequence Window feature values (which set the bounds for AWL/SWL/SWH) are undefined until the feature negotiation has completed. During the initial handshake we have more stringent sequence number protection, the changes added by this patch effect that {A,S}W{L,H} are within the correct bounds at the instant that feature negotiation completes (since the SeqWin feature activation handlers call dccp_update_gsr/gss()). A detailed rationale is below -- can be removed from the commit message. 1. Server sequence number checks during initial handshake --------------------------------------------------------- The server can not use the fields of the listening socket for seqno/ackno checks and thus needs to store all relevant information on a per-connection basis on the dccp_request socket. This is a size-constrained structure and has currently only ISS (dreq_iss) and ISR (dreq_isr) defined. Adding further fields (SW{L,H}, AW{L,H}) would increase the size of the struct and it is questionable whether this will have any practical gain. The currently implemented solution is as follows. * receiving first Request: dccp_v{4,6}_conn_request sets ISR := P.seqno, ISS := dccp_v{4,6}_init_sequence() * sending first Response: dccp_v{4,6}_send_response via dccp_make_response() sets P.seqno := ISS, sets P.ackno := ISR * receiving retransmitted Request: dccp_check_req() overrides ISR := P.seqno * answering retransmitted Request: dccp_make_response() sets ISS += 1, otherwise as per first Response * completing the handshake: succeeds in dccp_check_req() for the first Ack where P.ackno == ISS (P.seqno is not tested) * creating child socket: ISS, ISR are copied from the request_sock This solution will succeed whenever the server can receive the Request and the subsequent Ack in succession, without retransmissions. If there is packet loss, the client needs to retransmit until this condition succeeds; it will otherwise eventually give up. Adding further fields to the request_sock could increase the robustness a bit, in that it would make possible to let a reordered Ack (from a retransmitted Response) pass. The argument against such a solution is that if the packet loss is not persistent and an Ack gets through, why not wait for the one answering the original response: if the loss is persistent, it is probably better to not start the connection in the first place. Long story short: the present design (by Arnaldo) is simple and will likely work just as well as a more complicated solution. As a consequence, {A,S}W{L,H} are not needed until the moment the request_sock is cloned into the accept queue. At that stage feature negotiation has completed, so that the values for the local and remote Sequence Window feature (7.5.2) are known, i.e. we are now in a better position to compute {A,S}W{L,H}. 2. Client sequence number checks during initial handshake --------------------------------------------------------- Until entering PARTOPEN the client does not need the adjustments, since it constrains the Ack window to the packet it sent. * sending first Request: dccp_v{4,6}_connect() choose ISS, dccp_connect() then sets GAR := ISS (as per 8.5), dccp_transmit_skb() (with the previous bug fix) sets GSS := ISS, AWL := ISS, AWH := GSS * n-th retransmitted Request (with previous patch): dccp_retransmit_skb() via timer calls dccp_transmit_skb(), which sets GSS := ISS+n and then AWL := ISS, AWH := ISS+n * receiving any Response: dccp_rcv_request_sent_state_process() -- accepts packet if AWL <= P.ackno <= AWH; -- sets GSR = ISR = P.seqno * sending the Ack completing the handshake: dccp_send_ack() calls dccp_transmit_skb(), which sets GSS += 1 and AWL := ISS, AWH := GSS Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 20 ++++++++++++++++++++ net/dccp/input.c | 18 ++++++------------ net/dccp/minisocks.c | 30 +++++++++--------------------- 3 files changed, 35 insertions(+), 33 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index f9ed0cbd1bf3..e4d6e76ced41 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -415,6 +415,23 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq) dp->dccps_gsr = seq; /* Sequence validity window depends on remote Sequence Window (7.5.1) */ dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); + /* + * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, + * 7.5.1 we perform this check beyond the initial handshake: W/W' are + * always > 32, so for the first W/W' packets in the lifetime of a + * connection we always have to adjust SWL. + * A second reason why we are doing this is that the window depends on + * the feature-remote value of Sequence Window: nothing stops the peer + * from updating this value while we are busy adjusting SWL for the + * first W packets (we would have to count from scratch again then). + * Therefore it is safer to always make sure that the Sequence Window + * is not artificially extended by a peer who grows SWL downwards by + * continually updating the feature-remote Sequence-Window. + * If sequence numbers wrap it is bad luck. But that will take a while + * (48 bit), and this measure prevents Sequence-number attacks. + */ + if (before48(dp->dccps_swl, dp->dccps_isr)) + dp->dccps_swl = dp->dccps_isr; dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); } @@ -425,6 +442,9 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq) dp->dccps_gss = seq; /* Ack validity window depends on local Sequence Window value (7.5.1) */ dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); + /* Adjust AWL so that it is not below ISS - see comment above for SWL */ + if (before48(dp->dccps_awl, dp->dccps_iss)) + dp->dccps_awl = dp->dccps_iss; dp->dccps_awh = dp->dccps_gss; } diff --git a/net/dccp/input.c b/net/dccp/input.c index 5eb443f656c1..e3f43d55e3ce 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -440,20 +440,14 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; - dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; - dccp_update_gsr(sk, dp->dccps_isr); /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. - * - * AWL was adjusted in dccp_v4_connect -acme + * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect + * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH + * is done as part of activating the feature values below, since + * these settings depend on the local/remote Sequence Window + * features, which were undefined or not confirmed until now. */ - dccp_set_seqno(&dp->dccps_swl, - max48(dp->dccps_swl, dp->dccps_isr)); + dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 0ecb19c5e8ce..f4d9c8f60ede 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -120,30 +120,18 @@ struct sock *dccp_create_openreq_child(struct sock *sk, * * Choose S.ISS (initial seqno) or set from Init Cookies * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies + * Set S.ISR, S.GSR from packet (or Init Cookies) + * + * Setting AWL/AWH and SWL/SWH happens as part of the feature + * activation below, as these windows all depend on the local + * and remote Sequence Window feature values (7.5.2). */ - newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; - dccp_update_gss(newsk, dreq->dreq_iss); - - newdp->dccps_isr = dreq->dreq_isr; - dccp_update_gsr(newsk, dreq->dreq_isr); + newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; + newdp->dccps_gar = newdp->dccps_iss; + newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. - */ - dccp_set_seqno(&newdp->dccps_swl, - max48(newdp->dccps_swl, newdp->dccps_isr)); - dccp_set_seqno(&newdp->dccps_awl, - max48(newdp->dccps_awl, newdp->dccps_iss)); - - /* - * Activate features after initialising the sequence numbers, - * since CCID initialisation may depend on GSS, ISR, ISS etc. + * Activate features: initialise CCIDs, sequence windows etc. */ if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { /* It is still raw copy of parent, so invalidate From a9c1656ab10480cc6f6d34f193bcde2729fe8037 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 064/105] dccp: Merge now-reduced connect_init() function After moving the assignment of GAR/ISS from dccp_connect_init() to dccp_transmit_skb(), the former function becomes very small, so that a merger with dccp_connect() suggests itself. Signed-off-by: Gerrit Renker --- net/dccp/output.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/net/dccp/output.c b/net/dccp/output.c index be65bc30e201..1b3168307586 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -471,8 +471,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) /* * Do all connect socket setups that can be done AF independent. */ -static inline void dccp_connect_init(struct sock *sk) +int dccp_connect(struct sock *sk) { + struct sk_buff *skb; struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -482,22 +483,12 @@ static inline void dccp_connect_init(struct sock *sk) dccp_sync_mss(sk, dst_mtu(dst)); - /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ - dp->dccps_gar = dp->dccps_iss; - - icsk->icsk_retransmits = 0; -} - -int dccp_connect(struct sock *sk) -{ - struct sk_buff *skb; - struct inet_connection_sock *icsk = inet_csk(sk); - /* do not connect if feature negotiation setup fails */ if (dccp_feat_finalise_settings(dccp_sk(sk))) return -EPROTO; - dccp_connect_init(sk); + /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ + dp->dccps_gar = dp->dccps_iss; skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); if (unlikely(skb == NULL)) @@ -513,6 +504,7 @@ int dccp_connect(struct sock *sk) DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); /* Timer for repeating the REQUEST until an answer. */ + icsk->icsk_retransmits = 0; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, DCCP_RTO_MAX); return 0; From b8c6bcee1dbc1aadcd67af998e414e73fa166a7d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 065/105] dccp: Reduce noise in output and convert to ktime_t This fixes the problem that dccp_probe output can grow quite large without apparent benefit (many identical data points), creating huge files (up to over one Gigabyte for a few minutes' test run) which are very hard to post-process (in one instance it got so bad that gnuplot ate up all memory plus swap). The cause for the problem is that the kprobe is inserted into dccp_sendmsg(), which can be called in a polling-mode (whenever the TX queue is full due to congestion-control issues, EAGAIN is returned). This creates many very similar data points, i.e. the increase of processing time does not increase the quality/information of the probe output. The fix is to attach the probe to a different function -- write_xmit was chosen since it gets called continually (both via userspace and timer); an input-path function would stop sampling as soon as the other end stops sending feedback. For comparison the output file sizes for the same 20 second test run over a lossy link: * before / without patch: 118 Megabytes * after / with patch: 1.2 Megabytes and there was much less noise in the output. To allow backward compatibility with scripts that people use, the now-unused `size' field in the output has been replaced with the CCID identifier. This also serves for future compatibility - support for CCID2 is work in progress (depends on the still unfinished SRTT/RTTVAR updates). While at it, the update to ktime_t was also performed. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald --- net/dccp/probe.c | 66 ++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 41 deletions(-) diff --git a/net/dccp/probe.c b/net/dccp/probe.c index a87fd4fc1b3d..eaa59d82ab0f 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -46,70 +46,54 @@ static struct { struct kfifo *fifo; spinlock_t lock; wait_queue_head_t wait; - struct timespec tstart; + ktime_t start; } dccpw; -static void printl(const char *fmt, ...) -{ - va_list args; - int len; - struct timespec now; - char tbuf[256]; - - va_start(args, fmt); - getnstimeofday(&now); - - now = timespec_sub(now, dccpw.tstart); - - len = sprintf(tbuf, "%lu.%06lu ", - (unsigned long) now.tv_sec, - (unsigned long) now.tv_nsec / NSEC_PER_USEC); - len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); - va_end(args); - - kfifo_put(dccpw.fifo, tbuf, len); - wake_up(&dccpw.wait); -} - -static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t size) +static void jdccp_write_xmit(struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); struct ccid3_hc_tx_sock *hctx = NULL; + struct timespec tv; + char buf[256]; + int len, ccid = ccid_get_current_tx_ccid(dccp_sk(sk)); - if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) + if (ccid == DCCPC_CCID3) hctx = ccid3_hc_tx_sk(sk); - if (port == 0 || ntohs(inet->dport) == port || - ntohs(inet->sport) == port) { + if (!port || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { + + tv = ktime_to_timespec(ktime_sub(ktime_get(), dccpw.start)); + len = sprintf(buf, "%lu.%09lu %d.%d.%d.%d:%u %d.%d.%d.%d:%u %d", + (unsigned long)tv.tv_sec, + (unsigned long)tv.tv_nsec, + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), ccid); + if (hctx) - printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u " - "%llu %llu %d\n", - NIPQUAD(inet->saddr), ntohs(inet->sport), - NIPQUAD(inet->daddr), ntohs(inet->dport), size, + len += sprintf(buf + len, " %d %d %d %u %u %u %d", hctx->s, hctx->rtt, hctx->p, hctx->x_calc, - hctx->x_recv >> 6, hctx->x >> 6, hctx->t_ipi); - else - printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n", - NIPQUAD(inet->saddr), ntohs(inet->sport), - NIPQUAD(inet->daddr), ntohs(inet->dport), size); + (unsigned)(hctx->x_recv >> 6), + (unsigned)(hctx->x >> 6), hctx->t_ipi); + + len += sprintf(buf + len, "\n"); + kfifo_put(dccpw.fifo, buf, len); + wake_up(&dccpw.wait); } jprobe_return(); - return 0; } static struct jprobe dccp_send_probe = { .kp = { - .symbol_name = "dccp_sendmsg", + .symbol_name = "dccp_write_xmit", }, - .entry = jdccp_sendmsg, + .entry = jdccp_write_xmit, }; static int dccpprobe_open(struct inode *inode, struct file *file) { kfifo_reset(dccpw.fifo); - getnstimeofday(&dccpw.tstart); + dccpw.start = ktime_get(); return 0; } From ff49e27089ec363b7fc3849504e0435d447ab18a Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 066/105] dccp ccid-2: Ack Vector interface clean-up This patch brings the Ack Vector interface up to date. Its main purpose is to lay the basis for the subsequent patches of this set, which will use the new data structure fields and routines. There are no real algorithmic changes, rather an adaptation: (1) Replaced the static Ack Vector size (2) with a #define so that it can be adapted (with low loss / Ack Ratio, a value of 1 works, so 2 seems to be sufficient for the moment) and added a solution so that computing the ECN nonce will continue to work - even with larger Ack Vectors. (2) Replaced the #defines for Ack Vector states with a complete enum. (3) Replaced #defines to compute Ack Vector length and state with general purpose routines (inlines), and updated code to use these. (4) Added a `tail' field (conversion to circular buffer in subsequent patch). (5) Updated the (outdated) documentation for Ack Vector struct. (6) All sequence number containers now trimmed to 48 bits. (7) Removal of unused bits: * removed dccpav_ack_nonce from struct dccp_ackvec, since this is already redundantly stored in the `dccpavr_ack_nonce' (of Ack Vector record); * removed Elapsed Time for Ack Vectors (it was nowhere used); * replaced semantics of dccpavr_sent_len with dccpavr_ack_runlen, since the code needs to be able to remember the old run length; * reduced the de-/allocation routines (redundant / duplicate tests). Justification for removing Elapsed Time information [can be removed]: --------------------------------------------------------------------- 1. The Elapsed Time information for Ack Vectors was nowhere used in the code. 2. DCCP does not implement rate-based pacing of acknowledgments. The only recommendation for always including Elapsed Time is in section 11.3 of RFC 4340: "Receivers that rate-pace acknowledgements SHOULD [...] include Elapsed Time options". But such is not the case here. 3. It does not really improve estimation accuracy. The Elapsed Time field only records the time between the arrival of the last acknowledgeable packet and the time the Ack Vector is sent out. Since Linux does not (yet) implement delayed Acks, the time difference will typically be small, since often the arrival of a data packet triggers sending feedback at the HC-receiver. Justification for changes in de-/allocation routines [can be removed]: ---------------------------------------------------------------------- * INIT_LIST_HEAD in dccp_ackvec_record_new was redundant, since the list pointers were later overwritten when the node was added via list_add(); * dccp_ackvec_record_new() was called in a single place only; * calls to list_del_init() before calling dccp_ackvec_record_delete() were redundant, since subsequently the entire element was k-freed; * since all calls to dccp_ackvec_record_delete() were preceded to a call to list_del_init(), the WARN_ON test would never evaluate to true; * since all calls to dccp_ackvec_record_delete() were made from within list_for_each_entry_safe(), the test for avr == NULL was redundant; * list_empty() in ackvec_free was redundant, since the same condition is embedded in the loop condition of the subsequent list_for_each_entry_safe(). Signed-off-by: Gerrit Renker --- net/dccp/ackvec.c | 180 +++++++++++++++-------------------------- net/dccp/ackvec.h | 103 ++++++++++++----------- net/dccp/ccids/ccid2.c | 13 ++- net/dccp/input.c | 6 +- 4 files changed, 128 insertions(+), 174 deletions(-) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 01e4d39fa232..85ad70cfba5b 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -1,7 +1,8 @@ /* * net/dccp/ackvec.c * - * An implementation of the DCCP protocol + * An implementation of Ack Vectors for the DCCP protocol + * Copyright (c) 2007 University of Aberdeen, Scotland, UK * Copyright (c) 2005 Arnaldo Carvalho de Melo * * This program is free software; you can redistribute it and/or modify it @@ -23,24 +24,32 @@ static struct kmem_cache *dccp_ackvec_slab; static struct kmem_cache *dccp_ackvec_record_slab; -static struct dccp_ackvec_record *dccp_ackvec_record_new(void) +struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) { - struct dccp_ackvec_record *avr = - kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); + struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); - if (avr != NULL) - INIT_LIST_HEAD(&avr->avr_node); - - return avr; + if (av != NULL) { + av->av_buf_head = DCCPAV_MAX_ACKVEC_LEN - 1; + INIT_LIST_HEAD(&av->av_records); + } + return av; } -static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) +static void dccp_ackvec_purge_records(struct dccp_ackvec *av) { - if (unlikely(avr == NULL)) - return; - /* Check if deleting a linked record */ - WARN_ON(!list_empty(&avr->avr_node)); - kmem_cache_free(dccp_ackvec_record_slab, avr); + struct dccp_ackvec_record *cur, *next; + + list_for_each_entry_safe(cur, next, &av->av_records, avr_node) + kmem_cache_free(dccp_ackvec_record_slab, cur); + INIT_LIST_HEAD(&av->av_records); +} + +void dccp_ackvec_free(struct dccp_ackvec *av) +{ + if (likely(av != NULL)) { + dccp_ackvec_purge_records(av); + kmem_cache_free(dccp_ackvec_slab, av); + } } static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, @@ -68,24 +77,16 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; /* Figure out how many options do we need to represent the ackvec */ const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN); - u16 len = av->av_vec_len + 2 * nr_opts, i; - u32 elapsed_time; + u16 len = av->av_vec_len + 2 * nr_opts; + u8 i, nonce = 0; const unsigned char *tail, *from; unsigned char *to; struct dccp_ackvec_record *avr; - suseconds_t delta; if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) return -1; - delta = ktime_us_delta(ktime_get_real(), av->av_time); - elapsed_time = delta / 10; - - if (elapsed_time != 0 && - dccp_insert_option_elapsed_time(sk, skb, elapsed_time)) - return -1; - - avr = dccp_ackvec_record_new(); + avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); if (avr == NULL) return -1; @@ -94,7 +95,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) to = skb_push(skb, len); len = av->av_vec_len; from = av->av_buf + av->av_buf_head; - tail = av->av_buf + DCCP_MAX_ACKVEC_LEN; + tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; for (i = 0; i < nr_opts; ++i) { int copylen = len; @@ -102,7 +103,13 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) if (len > DCCP_SINGLE_OPT_MAXLEN) copylen = DCCP_SINGLE_OPT_MAXLEN; - *to++ = DCCPO_ACK_VECTOR_0; + /* + * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via + * its type; ack_nonce is the sum of all individual buf_nonce's. + */ + nonce ^= av->av_buf_nonce[i]; + + *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; *to++ = copylen + 2; /* Check if buf_head wraps */ @@ -123,75 +130,24 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) } /* - * From RFC 4340, A.2: - * - * For each acknowledgement it sends, the HC-Receiver will add an - * acknowledgement record. ack_seqno will equal the HC-Receiver - * sequence number it used for the ack packet; ack_ptr will equal - * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will - * equal buf_nonce. + * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. */ - avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - avr->avr_ack_ptr = av->av_buf_head; - avr->avr_ack_ackno = av->av_buf_ackno; - avr->avr_ack_nonce = av->av_buf_nonce; - avr->avr_sent_len = av->av_vec_len; + avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + avr->avr_ack_ptr = av->av_buf_head; + avr->avr_ack_ackno = av->av_buf_ackno; + avr->avr_ack_nonce = nonce; + avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); dccp_ackvec_insert_avr(av, avr); dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " "ack_ackno=%llu\n", - dccp_role(sk), avr->avr_sent_len, + dccp_role(sk), avr->avr_ack_runlen, (unsigned long long)avr->avr_ack_seqno, (unsigned long long)avr->avr_ack_ackno); return 0; } -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) -{ - struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); - - if (av != NULL) { - av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; - av->av_buf_ackno = UINT48_MAX + 1; - av->av_buf_nonce = 0; - av->av_time = ktime_set(0, 0); - av->av_vec_len = 0; - INIT_LIST_HEAD(&av->av_records); - } - - return av; -} - -void dccp_ackvec_free(struct dccp_ackvec *av) -{ - if (unlikely(av == NULL)) - return; - - if (!list_empty(&av->av_records)) { - struct dccp_ackvec_record *avr, *next; - - list_for_each_entry_safe(avr, next, &av->av_records, avr_node) { - list_del_init(&avr->avr_node); - dccp_ackvec_record_delete(avr); - } - } - - kmem_cache_free(dccp_ackvec_slab, av); -} - -static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, - const u32 index) -{ - return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; -} - -static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, - const u32 index) -{ - return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; -} - /* * If several packets are missing, the HC-Receiver may prefer to enter multiple * bytes with run length 0, rather than a single byte with a larger run length; @@ -204,7 +160,7 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, unsigned int gap; long new_head; - if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) + if (av->av_vec_len + packets > DCCPAV_MAX_ACKVEC_LEN) return -ENOBUFS; gap = packets - 1; @@ -212,18 +168,18 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, if (new_head < 0) { if (gap > 0) { - memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, + memset(av->av_buf, DCCPAV_NOT_RECEIVED, gap + new_head + 1); gap = -new_head; } - new_head += DCCP_MAX_ACKVEC_LEN; + new_head += DCCPAV_MAX_ACKVEC_LEN; } av->av_buf_head = new_head; if (gap > 0) memset(av->av_buf + av->av_buf_head + 1, - DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); + DCCPAV_NOT_RECEIVED, gap); av->av_buf[av->av_buf_head] = state; av->av_vec_len += packets; @@ -236,6 +192,8 @@ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, const u64 ackno, const u8 state) { + u8 *cur_head = av->av_buf + av->av_buf_head, + *buf_end = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; /* * Check at the right places if the buffer is full, if it is, tell the * caller to start dropping packets till the HC-Sender acks our ACK @@ -260,7 +218,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, /* See if this is the first ackno being inserted */ if (av->av_vec_len == 0) { - av->av_buf[av->av_buf_head] = state; + *cur_head = state; av->av_vec_len = 1; } else if (after48(ackno, av->av_buf_ackno)) { const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); @@ -269,10 +227,9 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, * Look if the state of this packet is the same as the * previous ackno and if so if we can bump the head len. */ - if (delta == 1 && - dccp_ackvec_state(av, av->av_buf_head) == state && - dccp_ackvec_len(av, av->av_buf_head) < DCCP_ACKVEC_LEN_MASK) - av->av_buf[av->av_buf_head]++; + if (delta == 1 && dccp_ackvec_state(cur_head) == state && + dccp_ackvec_runlen(cur_head) < DCCPAV_MAX_RUNLEN) + *cur_head += 1; else if (dccp_ackvec_set_buf_head_state(av, delta, state)) return -ENOBUFS; } else { @@ -285,21 +242,17 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, * could reduce the complexity of this scan.) */ u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); - u32 index = av->av_buf_head; while (1) { - const u8 len = dccp_ackvec_len(av, index); - const u8 av_state = dccp_ackvec_state(av, index); + const u8 len = dccp_ackvec_runlen(cur_head); /* * valid packets not yet in av_buf have a reserved * entry, with a len equal to 0. */ - if (av_state == DCCP_ACKVEC_STATE_NOT_RECEIVED && - len == 0 && delta == 0) { /* Found our - reserved seat! */ + if (*cur_head == DCCPAV_NOT_RECEIVED && delta == 0) { dccp_pr_debug("Found %llu reserved seat!\n", (unsigned long long)ackno); - av->av_buf[index] = state; + *cur_head = state; goto out; } /* len == 0 means one packet */ @@ -307,13 +260,12 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, goto out_duplicate; delta -= len + 1; - if (++index == DCCP_MAX_ACKVEC_LEN) - index = 0; + if (++cur_head == buf_end) + cur_head = av->av_buf; } } av->av_buf_ackno = ackno; - av->av_time = ktime_get_real(); out: return 0; @@ -333,13 +285,13 @@ static void dccp_ackvec_throw_record(struct dccp_ackvec *av, if (av->av_buf_head <= avr->avr_ack_ptr) av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; else - av->av_vec_len = DCCP_MAX_ACKVEC_LEN - 1 - + av->av_vec_len = DCCPAV_MAX_ACKVEC_LEN - 1 - av->av_buf_head + avr->avr_ack_ptr; /* free records */ list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { - list_del_init(&avr->avr_node); - dccp_ackvec_record_delete(avr); + list_del(&avr->avr_node); + kmem_cache_free(dccp_ackvec_record_slab, avr); } } @@ -357,7 +309,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, if (ackno == avr->avr_ack_seqno) { dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " "ack_ackno=%llu, ACKED!\n", - dccp_role(sk), 1, + dccp_role(sk), avr->avr_ack_runlen, (unsigned long long)avr->avr_ack_seqno, (unsigned long long)avr->avr_ack_ackno); dccp_ackvec_throw_record(av, avr); @@ -387,7 +339,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, */ avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); while (i--) { - const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; + const u8 rl = dccp_ackvec_runlen(vector); u64 ackno_end_rl; dccp_set_seqno(&ackno_end_rl, *ackno - rl); @@ -404,8 +356,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, break; found: if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { - const u8 state = *vector & DCCP_ACKVEC_STATE_MASK; - if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) { + if (dccp_ackvec_state(vector) != DCCPAV_NOT_RECEIVED) { dccp_pr_debug("%s ACK vector 0, len=%d, " "ack_seqno=%llu, ack_ackno=%llu, " "ACKED!\n", @@ -448,10 +399,9 @@ int __init dccp_ackvec_init(void) if (dccp_ackvec_slab == NULL) goto out_err; - dccp_ackvec_record_slab = - kmem_cache_create("dccp_ackvec_record", - sizeof(struct dccp_ackvec_record), - 0, SLAB_HWCACHE_ALIGN, NULL); + dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", + sizeof(struct dccp_ackvec_record), + 0, SLAB_HWCACHE_ALIGN, NULL); if (dccp_ackvec_record_slab == NULL) goto out_destroy_slab; diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 1c10814fdf72..df18f9030dbb 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -3,9 +3,9 @@ /* * net/dccp/ackvec.h * - * An implementation of the DCCP protocol + * An implementation of Ack Vectors for the DCCP protocol + * Copyright (c) 2007 University of Aberdeen, Scotland, UK * Copyright (c) 2005 Arnaldo Carvalho de Melo - * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -13,75 +13,84 @@ #include #include -#include #include #include -/* We can spread an ack vector across multiple options */ -#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2) +/* + * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN, + * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 + * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives + * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. + */ +#define DCCPAV_NUM_ACKVECS 2 +#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) /* Estimated minimum average Ack Vector length - used for updating MPS */ #define DCCPAV_MIN_OPTLEN 16 -#define DCCP_ACKVEC_STATE_RECEIVED 0 -#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) -#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) +enum dccp_ackvec_states { + DCCPAV_RECEIVED = 0x00, + DCCPAV_ECN_MARKED = 0x40, + DCCPAV_RESERVED = 0x80, + DCCPAV_NOT_RECEIVED = 0xC0 +}; +#define DCCPAV_MAX_RUNLEN 0x3F -#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */ -#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */ +static inline u8 dccp_ackvec_runlen(const u8 *cell) +{ + return *cell & DCCPAV_MAX_RUNLEN; +} -/** struct dccp_ackvec - ack vector +static inline u8 dccp_ackvec_state(const u8 *cell) +{ + return *cell & ~DCCPAV_MAX_RUNLEN; +} + +/** struct dccp_ackvec - Ack Vector main data structure * - * This data structure is the one defined in RFC 4340, Appendix A. + * This implements a fixed-size circular buffer within an array and is largely + * based on Appendix A of RFC 4340. * - * @av_buf_head - circular buffer head - * @av_buf_tail - circular buffer tail - * @av_buf_ackno - ack # of the most recent packet acknowledgeable in the - * buffer (i.e. %av_buf_head) - * @av_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked - * by the buffer with State 0 - * - * Additionally, the HC-Receiver must keep some information about the - * Ack Vectors it has recently sent. For each packet sent carrying an - * Ack Vector, it remembers four variables: - * - * @av_records - list of dccp_ackvec_record - * @av_ack_nonce - the one-bit sum of the ECN Nonces for all State 0. - * - * @av_time - the time in usecs - * @av_buf - circular buffer of acknowledgeable packets + * @av_buf: circular buffer storage area + * @av_buf_head: head index; begin of live portion in @av_buf + * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf + * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf + * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to + * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf + * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) + * @av_veclen: length of the live portion of @av_buf */ struct dccp_ackvec { - u64 av_buf_ackno; - struct list_head av_records; - ktime_t av_time; + u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; u16 av_buf_head; + u16 av_buf_tail; + u64 av_buf_ackno:48; + bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; + struct list_head av_records; u16 av_vec_len; - u8 av_buf_nonce; - u8 av_ack_nonce; - u8 av_buf[DCCP_MAX_ACKVEC_LEN]; }; -/** struct dccp_ackvec_record - ack vector record +/** struct dccp_ackvec_record - Records information about sent Ack Vectors * - * ACK vector record as defined in Appendix A of spec. + * These list entries define the additional information which the HC-Receiver + * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A. * - * The list is sorted by avr_ack_seqno + * @avr_node: the list node in @av_records + * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on + * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to + * @avr_ack_ptr: pointer into @av_buf where this record starts + * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending + * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent * - * @avr_node - node in av_records - * @avr_ack_seqno - sequence number of the packet this record was sent on - * @avr_ack_ackno - sequence number being acknowledged - * @avr_ack_ptr - pointer into av_buf where this record starts - * @avr_ack_nonce - av_ack_nonce at the time this record was sent - * @avr_sent_len - lenght of the record in av_buf + * The list as a whole is sorted in descending order by @avr_ack_seqno. */ struct dccp_ackvec_record { struct list_head avr_node; - u64 avr_ack_seqno; - u64 avr_ack_ackno; + u64 avr_ack_seqno:48; + u64 avr_ack_ackno:48; u16 avr_ack_ptr; - u16 avr_sent_len; - u8 avr_ack_nonce; + u8 avr_ack_runlen; + u8 avr_ack_nonce:1; }; struct sock; diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index f56ab68a4b78..813d5cd40e8b 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -580,8 +580,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) &vector, &veclen)) != -1) { /* go through this ack vector */ while (veclen--) { - const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK; - u64 ackno_end_rl = SUB48(ackno, rl); + u64 ackno_end_rl = SUB48(ackno, dccp_ackvec_runlen(vector)); ccid2_pr_debug("ackvec start:%llu end:%llu\n", (unsigned long long)ackno, @@ -604,17 +603,15 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * run length */ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = *vector & - DCCP_ACKVEC_STATE_MASK; + const u8 state = dccp_ackvec_state(vector); /* new packet received or marked */ - if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED && + if (state != DCCPAV_NOT_RECEIVED && !seqp->ccid2s_acked) { - if (state == - DCCP_ACKVEC_STATE_ECN_MARKED) { + if (state == DCCPAV_ECN_MARKED) ccid2_congestion_event(sk, seqp); - } else + else ccid2_new_ack(sk, seqp, &maxincr); diff --git a/net/dccp/input.c b/net/dccp/input.c index e3f43d55e3ce..70ad0ba72146 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -377,8 +377,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) + DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED)) goto discard; dccp_deliver_input_to_ccids(sk, skb); @@ -627,8 +626,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) + DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED)) goto discard; dccp_deliver_input_to_ccids(sk, skb); From 4829007c7bc689cbc290fc09eccbe90bd52c2a5e Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 067/105] dccp ccid-2: Separate internals of Ack Vectors from option-parsing code This patch * separates Ack Vector housekeeping code from option-insertion code; * shifts option-specific code from ackvec.c into options.c; * introduces a dedicated routine to take care of the Ack Vector records; * simplifies the dccp_ackvec_insert_avr() routine: the BUG_ON was redundant, since the list is automatically arranged in descending order of ack_seqno. Signed-off-by: Gerrit Renker --- net/dccp/ackvec.c | 100 ++++++++------------------------------------- net/dccp/ackvec.h | 5 +-- net/dccp/options.c | 60 +++++++++++++++++++++++++++ 3 files changed, 80 insertions(+), 85 deletions(-) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 85ad70cfba5b..de84dd34c93f 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -52,99 +52,35 @@ void dccp_ackvec_free(struct dccp_ackvec *av) } } -static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, - struct dccp_ackvec_record *avr) +/** + * dccp_ackvec_update_records - Record information about sent Ack Vectors + * @av: Ack Vector records to update + * @seqno: Sequence number of the packet carrying the Ack Vector just sent + * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector + */ +int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) { - /* - * AVRs are sorted by seqno. Since we are sending them in order, we - * just add the AVR at the head of the list. - * -sorbo. - */ - if (!list_empty(&av->av_records)) { - const struct dccp_ackvec_record *head = - list_entry(av->av_records.next, - struct dccp_ackvec_record, - avr_node); - BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno)); - } - - list_add(&avr->avr_node, &av->av_records); -} - -int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; - /* Figure out how many options do we need to represent the ackvec */ - const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN); - u16 len = av->av_vec_len + 2 * nr_opts; - u8 i, nonce = 0; - const unsigned char *tail, *from; - unsigned char *to; struct dccp_ackvec_record *avr; - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); if (avr == NULL) - return -1; + return -ENOBUFS; - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - to = skb_push(skb, len); - len = av->av_vec_len; - from = av->av_buf + av->av_buf_head; - tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; - - for (i = 0; i < nr_opts; ++i) { - int copylen = len; - - if (len > DCCP_SINGLE_OPT_MAXLEN) - copylen = DCCP_SINGLE_OPT_MAXLEN; - - /* - * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via - * its type; ack_nonce is the sum of all individual buf_nonce's. - */ - nonce ^= av->av_buf_nonce[i]; - - *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; - *to++ = copylen + 2; - - /* Check if buf_head wraps */ - if (from + copylen > tail) { - const u16 tailsize = tail - from; - - memcpy(to, from, tailsize); - to += tailsize; - len -= tailsize; - copylen -= tailsize; - from = av->av_buf; - } - - memcpy(to, from, copylen); - from += copylen; - to += copylen; - len -= copylen; - } - - /* - * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. - */ - avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; + avr->avr_ack_seqno = seqno; avr->avr_ack_ptr = av->av_buf_head; avr->avr_ack_ackno = av->av_buf_ackno; - avr->avr_ack_nonce = nonce; + avr->avr_ack_nonce = nonce_sum; avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); + /* + * Since GSS is incremented for each packet, the list is automatically + * arranged in descending order of @ack_seqno. + */ + list_add(&avr->avr_node, &av->av_records); - dccp_ackvec_insert_avr(av, avr); - - dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " - "ack_ackno=%llu\n", - dccp_role(sk), avr->avr_ack_runlen, + dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", (unsigned long long)avr->avr_ack_seqno, - (unsigned long long)avr->avr_ack_ackno); + (unsigned long long)avr->avr_ack_ackno, + avr->avr_ack_runlen); return 0; } diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index df18f9030dbb..b34e5ed4c342 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -112,7 +112,7 @@ extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, u64 *ackno, const u8 opt, const u8 *value, const u8 len); -extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb); +extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) { @@ -155,8 +155,7 @@ static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, return -1; } -static inline int dccp_insert_option_ackvec(const struct sock *sk, - const struct sk_buff *skb) +static inline int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 nonce) { return -1; } diff --git a/net/dccp/options.c b/net/dccp/options.c index 9fe0510f4022..392d7db31342 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -428,6 +428,66 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, return 0; } +static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; + /* Figure out how many options do we need to represent the ackvec */ + const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN); + u16 len = av->av_vec_len + 2 * nr_opts; + u8 i, nonce = 0; + const unsigned char *tail, *from; + unsigned char *to; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + return -1; + + DCCP_SKB_CB(skb)->dccpd_opt_len += len; + + to = skb_push(skb, len); + len = av->av_vec_len; + from = av->av_buf + av->av_buf_head; + tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; + + for (i = 0; i < nr_opts; ++i) { + int copylen = len; + + if (len > DCCP_SINGLE_OPT_MAXLEN) + copylen = DCCP_SINGLE_OPT_MAXLEN; + + /* + * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via + * its type; ack_nonce is the sum of all individual buf_nonce's. + */ + nonce ^= av->av_buf_nonce[i]; + + *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i]; + *to++ = copylen + 2; + + /* Check if buf_head wraps */ + if (from + copylen > tail) { + const u16 tailsize = tail - from; + + memcpy(to, from, tailsize); + to += tailsize; + len -= tailsize; + copylen -= tailsize; + from = av->av_buf; + } + + memcpy(to, from, copylen); + from += copylen; + to += copylen; + len -= copylen; + } + /* + * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. + */ + if (dccp_ackvec_update_records(av, DCCP_SKB_CB(skb)->dccpd_seq, nonce)) + return -ENOBUFS; + return 0; +} + /** * dccp_insert_option_mandatory - Mandatory option (5.8.2) * Note that since we are using skb_push, this function needs to be called From d7dc7e5f49299739e610ea8febf9ea91a4dc1ae9 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 068/105] dccp ccid-2: Implementation of circular Ack Vector buffer with overflow handling This completes the implementation of a circular buffer for Ack Vectors, by extending the current (linear array-based) implementation. The changes are: (a) An `overflow' flag to deal with the case of overflow. As before, dynamic growth of the buffer will not be supported; but code will be added to deal robustly with overflowing Ack Vector buffers. (b) A `tail_seqno' field. When naively implementing the algorithm of Appendix A in RFC 4340, problems arise whenever subsequent Ack Vector records overlap, which can bring the entire run length calculation completely out of synch. (This is documented on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/\ ack_vectors/tracking_tail_ackno/ .) (c) The buffer lengthi is now computed dynamically (i.e. current fill level), as the span between head to tail. As a result, dccp_ackvec_pending() is now simpler - the #ifdef is no longer necessary since buf_empty is always true when IP_DCCP_ACKVEC is not configured. Note on overflow handling: ------------------------- The Ack Vector code previously simply started to drop packets when the Ack Vector buffer overflowed. This means that the userspace application will not be able to receive, only because of an Ack Vector storage problem. Furthermore, overflow may be transient, so that applications may later recover from the overflow. Recovering from dropped packets is more difficult (e.g. video key frames). Hence the patch uses a different policy: when the buffer overflows, the oldest entries are subsequently overwritten. This has a higher chance of recovery. Details are on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ack_vectors/ Signed-off-by: Gerrit Renker --- net/dccp/ackvec.c | 31 ++++++++++++++++++++++++++++++- net/dccp/ackvec.h | 17 ++++++++++++++--- net/dccp/dccp.h | 14 +++++++------- net/dccp/options.c | 10 +++++----- 4 files changed, 56 insertions(+), 16 deletions(-) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index de84dd34c93f..1184d5e5dc96 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -29,7 +29,7 @@ struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); if (av != NULL) { - av->av_buf_head = DCCPAV_MAX_ACKVEC_LEN - 1; + av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; INIT_LIST_HEAD(&av->av_records); } return av; @@ -71,6 +71,14 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) avr->avr_ack_ackno = av->av_buf_ackno; avr->avr_ack_nonce = nonce_sum; avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); + /* + * When the buffer overflows, we keep no more than one record. This is + * the simplest way of disambiguating sender-Acks dating from before the + * overflow from sender-Acks which refer to after the overflow; a simple + * solution is preferable here since we are handling an exception. + */ + if (av->av_overflow) + dccp_ackvec_purge_records(av); /* * Since GSS is incremented for each packet, the list is automatically * arranged in descending order of @ack_seqno. @@ -84,6 +92,27 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) return 0; } +/* + * Buffer index and length computation using modulo-buffersize arithmetic. + * Note that, as pointers move from right to left, head is `before' tail. + */ +static inline u16 __ackvec_idx_add(const u16 a, const u16 b) +{ + return (a + b) % DCCPAV_MAX_ACKVEC_LEN; +} + +static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) +{ + return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); +} + +u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) +{ + if (unlikely(av->av_overflow)) + return DCCPAV_MAX_ACKVEC_LEN; + return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); +} + /* * If several packets are missing, the HC-Receiver may prefer to enter multiple * bytes with run length 0, rather than a single byte with a larger run length; diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index b34e5ed4c342..92f65b0fef5b 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -21,6 +21,7 @@ * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1 * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives * more headroom if Ack Ratio is higher or when the sender acknowledges slowly. + * The maximum value is bounded by the u16 types for indices and functions. */ #define DCCPAV_NUM_ACKVECS 2 #define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS) @@ -55,8 +56,10 @@ static inline u8 dccp_ackvec_state(const u8 *cell) * @av_buf_head: head index; begin of live portion in @av_buf * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf + * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf + * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) * @av_veclen: length of the live portion of @av_buf */ @@ -65,7 +68,9 @@ struct dccp_ackvec { u16 av_buf_head; u16 av_buf_tail; u64 av_buf_ackno:48; + u64 av_tail_ackno:48; bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; + u8 av_overflow:1; struct list_head av_records; u16 av_vec_len; }; @@ -113,10 +118,11 @@ extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, const u8 *value, const u8 len); extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); +extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); -static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) +static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) { - return av->av_vec_len; + return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; } #else /* CONFIG_IP_DCCP_ACKVEC */ static inline int dccp_ackvec_init(void) @@ -160,9 +166,14 @@ static inline int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 return -1; } -static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) +static inline u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) { return 0; } + +static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) +{ + return true; +} #endif /* CONFIG_IP_DCCP_ACKVEC */ #endif /* _ACKVEC_H */ diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e4d6e76ced41..1e65378eea3f 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -448,15 +448,15 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq) dp->dccps_awh = dp->dccps_gss; } +static inline int dccp_ackvec_pending(const struct sock *sk) +{ + return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL && + !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec); +} + static inline int dccp_ack_pending(const struct sock *sk) { - const struct dccp_sock *dp = dccp_sk(sk); - return -#ifdef CONFIG_IP_DCCP_ACKVEC - (dp->dccps_hc_rx_ackvec != NULL && - dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || -#endif - inet_csk_ack_scheduled(sk); + return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk); } extern int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val); diff --git a/net/dccp/options.c b/net/dccp/options.c index 392d7db31342..3163ae980f16 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -432,9 +432,10 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; + const u16 buflen = dccp_ackvec_buflen(av); /* Figure out how many options do we need to represent the ackvec */ - const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN); - u16 len = av->av_vec_len + 2 * nr_opts; + const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); + u16 len = buflen + 2 * nr_opts; u8 i, nonce = 0; const unsigned char *tail, *from; unsigned char *to; @@ -445,7 +446,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_opt_len += len; to = skb_push(skb, len); - len = av->av_vec_len; + len = buflen; from = av->av_buf + av->av_buf_head; tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; @@ -583,8 +584,7 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) if (dccp_insert_option_timestamp(sk, skb)) return -1; - } else if (dp->dccps_hc_rx_ackvec != NULL && - dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && + } else if (dccp_ackvec_pending(sk) && dccp_insert_option_ackvec(sk, skb)) { return -1; } From 68b1de15765f2b0e0925e692dab2b2fa2abd93fc Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 069/105] dccp ccid-2: Algorithm to update buffer state This provides a routine to consistently update the buffer state when the peer acknowledges receipt of Ack Vectors; updating state in the list of Ack Vectors as well as in the circular buffer. While based on RFC 4340, several additional (and necessary) precautions were added to protect the consistency of the buffer state. These additions are essential, since analysis and experience showed that the basic algorithm was insufficient for this task (which lead to problems that were hard to debug). The algorithm now * deals with HC-sender acknowledging to HC-receiver and vice versa, * keeps track of the last unacknowledged but received seqno in tail_ackno, * has special cases to reset the overflow condition when appropriate, * is protected against receiving older information (would mess up buffer state). Note: The older code performed an unnecessary step, where the sender cleared Ack Vector state by parsing the Ack Vector received by the HC-receiver. Doing this was entirely redundant, since * the receiver always puts the full acknowledgment window (groups 2,3 in 11.4.2) into the Ack Vectors it sends; hence the HC-receiver is only interested in the highest state that the HC-sender received; * this means that the acknowledgment number on the (Data)Ack from the HC-sender is sufficient; and work done in parsing earlier state is not necessary, since the later state subsumes the earlier one (see also RFC 4340, A.4). This older interface (dccp_ackvec_parse()) is therefore removed. Signed-off-by: Gerrit Renker --- net/dccp/ackvec.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++ net/dccp/ackvec.h | 6 ++++ net/dccp/input.c | 4 +-- net/dccp/options.c | 6 ++-- 4 files changed, 98 insertions(+), 6 deletions(-) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 1184d5e5dc96..f1341a617f96 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -92,6 +92,24 @@ int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) return 0; } +static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, + const u64 ackno) +{ + struct dccp_ackvec_record *avr; + /* + * Exploit that records are inserted in descending order of sequence + * number, start with the oldest record first. If @ackno is `before' + * the earliest ack_ackno, the packet is too old to be considered. + */ + list_for_each_entry_reverse(avr, av_list, avr_node) { + if (avr->avr_ack_seqno == ackno) + return avr; + if (before48(ackno, avr->avr_ack_seqno)) + break; + } + return NULL; +} + /* * Buffer index and length computation using modulo-buffersize arithmetic. * Note that, as pointers move from right to left, head is `before' tail. @@ -356,6 +374,76 @@ int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, return 0; } +/** + * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection + * This routine is called when the peer acknowledges the receipt of Ack Vectors + * up to and including @ackno. While based on on section A.3 of RFC 4340, here + * are additional precautions to prevent corrupted buffer state. In particular, + * we use tail_ackno to identify outdated records; it always marks the earliest + * packet of group (2) in 11.4.2. + */ +void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno) + { + struct dccp_ackvec_record *avr, *next; + u8 runlen_now, eff_runlen; + s64 delta; + + avr = dccp_ackvec_lookup(&av->av_records, ackno); + if (avr == NULL) + return; + /* + * Deal with outdated acknowledgments: this arises when e.g. there are + * several old records and the acks from the peer come in slowly. In + * that case we may still have records that pre-date tail_ackno. + */ + delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno); + if (delta < 0) + goto free_records; + /* + * Deal with overlapping Ack Vectors: don't subtract more than the + * number of packets between tail_ackno and ack_ackno. + */ + eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen; + + runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr); + /* + * The run length of Ack Vector cells does not decrease over time. If + * the run length is the same as at the time the Ack Vector was sent, we + * free the ack_ptr cell. That cell can however not be freed if the run + * length has increased: in this case we need to move the tail pointer + * backwards (towards higher indices), to its next-oldest neighbour. + */ + if (runlen_now > eff_runlen) { + + av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1; + av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1); + + /* This move may not have cleared the overflow flag. */ + if (av->av_overflow) + av->av_overflow = (av->av_buf_head == av->av_buf_tail); + } else { + av->av_buf_tail = avr->avr_ack_ptr; + /* + * We have made sure that avr points to a valid cell within the + * buffer. This cell is either older than head, or equals head + * (empty buffer): in both cases we no longer have any overflow. + */ + av->av_overflow = 0; + } + + /* + * The peer has acknowledged up to and including ack_ackno. Hence the + * first packet in group (2) of 11.4.2 is the successor of ack_ackno. + */ + av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1); + +free_records: + list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { + list_del(&avr->avr_node); + kmem_cache_free(dccp_ackvec_record_slab, avr); + } +} + int __init dccp_ackvec_init(void) { dccp_ackvec_slab = kmem_cache_create("dccp_ackvec", diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 92f65b0fef5b..b757e9b4110f 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -118,6 +118,7 @@ extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, const u8 *value, const u8 len); extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); +extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) @@ -149,6 +150,11 @@ static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, return -1; } +static inline void dccp_ackvec_clear_state(struct dccp_ackvec *av, + const u64 ackno) +{ +} + static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, const u64 ackno) { diff --git a/net/dccp/input.c b/net/dccp/input.c index 70ad0ba72146..77a5d57ab702 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -164,8 +164,8 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); if (dp->dccps_hc_rx_ackvec != NULL) - dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_ack_seq); + dccp_ackvec_clear_state(dp->dccps_hc_rx_ackvec, + DCCP_SKB_CB(skb)->dccpd_ack_seq); } static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) diff --git a/net/dccp/options.c b/net/dccp/options.c index 3163ae980f16..b11d7b7167f0 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -54,7 +54,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, struct dccp_sock *dp = dccp_sk(sk); const struct dccp_hdr *dh = dccp_hdr(skb); const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type; - u64 ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); unsigned char *opt_ptr = options; const unsigned char *opt_end = (unsigned char *)dh + @@ -133,9 +132,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_ACK_VECTOR_1: if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ break; - if (dp->dccps_hc_rx_ackvec != NULL && - dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) - goto out_invalid_option; + dccp_pr_debug("%s Ack Vector (len=%u)\n", dccp_role(sk), + len); break; case DCCPO_TIMESTAMP: if (len != 4) From e28fe59f9c82ef55fc9b55e745531c9fed86f00a Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 070/105] dccp ccid-2: Update code for the Ack Vector input/registration routine This patch uupdates the code which registers new packets as received, using the new circular buffer interface. It contributes a new algorithm which * supports both tail/head pointers and buffer wrap-around and * deals with overflow (head/tail move in lock-step). The updated code is also partioned differently, into 1. dealing with the empty buffer, 2. adding new packets into non-empty buffer, 3. reserving space when encountering a `hole' in the sequence space, 4. updating old state and deciding when old state is irrelevant. Protection against large burst losses: With regard to (3), it is too costly to reserve space when there are large bursts of losses. When bursts get too large, the code does no longer reserve space and just fills in cells normally. This measure reduces space consumption by a factor of 63. The code reuses in part the previous implementation by Arnaldo de Melo. Signed-off-by: Gerrit Renker --- net/dccp/ackvec.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++ net/dccp/ackvec.h | 9 +++ 2 files changed, 159 insertions(+) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index f1341a617f96..bf9cb7d7549d 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -131,6 +131,156 @@ u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); } +/** + * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 + * @av: non-empty buffer to update + * @distance: negative or zero distance of @seqno from buf_ackno downward + * @seqno: the (old) sequence number whose record is to be updated + * @state: state in which packet carrying @seqno was received + */ +static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, + u64 seqno, enum dccp_ackvec_states state) +{ + u16 ptr = av->av_buf_head; + + BUG_ON(distance > 0); + if (unlikely(dccp_ackvec_is_empty(av))) + return; + + do { + u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); + + if (distance + runlen >= 0) { + /* + * Only update the state if packet has not been received + * yet. This is OK as per the second table in RFC 4340, + * 11.4.1; i.e. here we are using the following table: + * RECEIVED + * 0 1 3 + * S +---+---+---+ + * T 0 | 0 | 0 | 0 | + * O +---+---+---+ + * R 1 | 1 | 1 | 1 | + * E +---+---+---+ + * D 3 | 0 | 1 | 3 | + * +---+---+---+ + * The "Not Received" state was set by reserve_seats(). + */ + if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) + av->av_buf[ptr] = state; + else + dccp_pr_debug("Not changing %llu state to %u\n", + (unsigned long long)seqno, state); + break; + } + + distance += runlen + 1; + ptr = __ackvec_idx_add(ptr, 1); + + } while (ptr != av->av_buf_tail); +} + +/* Mark @num entries after buf_head as "Not yet received". */ +static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) +{ + u16 start = __ackvec_idx_add(av->av_buf_head, 1), + len = DCCPAV_MAX_ACKVEC_LEN - start; + + /* check for buffer wrap-around */ + if (num > len) { + memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); + start = 0; + num -= len; + } + if (num) + memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); +} + +/** + * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer + * @av: container of buffer to update (can be empty or non-empty) + * @num_packets: number of packets to register (must be >= 1) + * @seqno: sequence number of the first packet in @num_packets + * @state: state in which packet carrying @seqno was received + */ +static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, + u64 seqno, enum dccp_ackvec_states state) +{ + u32 num_cells = num_packets; + + if (num_packets > DCCPAV_BURST_THRESH) { + u32 lost_packets = num_packets - 1; + + DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets); + /* + * We received 1 packet and have a loss of size "num_packets-1" + * which we squeeze into num_cells-1 rather than reserving an + * entire byte for each lost packet. + * The reason is that the vector grows in O(burst_length); when + * it grows too large there will no room left for the payload. + * This is a trade-off: if a few packets out of the burst show + * up later, their state will not be changed; it is simply too + * costly to reshuffle/reallocate/copy the buffer each time. + * Should such problems persist, we will need to switch to a + * different underlying data structure. + */ + for (num_packets = num_cells = 1; lost_packets; ++num_cells) { + u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN); + + av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1); + av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len; + + lost_packets -= len; + } + } + + if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) { + DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n"); + av->av_overflow = true; + } + + av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets); + if (av->av_overflow) + av->av_buf_tail = av->av_buf_head; + + av->av_buf[av->av_buf_head] = state; + av->av_buf_ackno = seqno; + + if (num_packets > 1) + dccp_ackvec_reserve_seats(av, num_packets - 1); +} + +/** + * dccp_ackvec_input - Register incoming packet in the buffer + */ +void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) +{ + u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq; + enum dccp_ackvec_states state = DCCPAV_RECEIVED; + + if (dccp_ackvec_is_empty(av)) { + dccp_ackvec_add_new(av, 1, seqno, state); + av->av_tail_ackno = seqno; + + } else { + s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno); + u8 *current_head = av->av_buf + av->av_buf_head; + + if (num_packets == 1 && + dccp_ackvec_state(current_head) == state && + dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) { + + *current_head += 1; + av->av_buf_ackno = seqno; + + } else if (num_packets > 0) { + dccp_ackvec_add_new(av, num_packets, seqno, state); + } else { + dccp_ackvec_update_old(av, num_packets, seqno, state); + } + } +} + /* * If several packets are missing, the HC-Receiver may prefer to enter multiple * bytes with run length 0, rather than a single byte with a larger run length; diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index b757e9b4110f..36ca2e9e5c84 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -29,6 +29,9 @@ /* Estimated minimum average Ack Vector length - used for updating MPS */ #define DCCPAV_MIN_OPTLEN 16 +/* Threshold for coping with large bursts of losses */ +#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8) + enum dccp_ackvec_states { DCCPAV_RECEIVED = 0x00, DCCPAV_ECN_MARKED = 0x40, @@ -117,6 +120,7 @@ extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, u64 *ackno, const u8 opt, const u8 *value, const u8 len); +extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); extern u16 dccp_ackvec_buflen(const struct dccp_ackvec *av); @@ -144,6 +148,11 @@ static inline void dccp_ackvec_free(struct dccp_ackvec *av) { } +static inline void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) +{ + +} + static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, const u64 ackno, const u8 state) { From 283fb4a5f39d1521d53e1044bff0ba2654acf145 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 071/105] dccp ccid-2: Consolidate Ack-Vector processing within main DCCP module This aggregates Ack Vector processing (handling input and clearing old state) into one function, for the following reasons and benefits: * all Ack Vector-specific processing is now in one place; * duplicated code is removed; * ensuring sanity: from an Ack Vector point of view, it is better to clear the old state first before entering new state; * Ack Event handling happens mostly within the CCIDs, not the main DCCP module. Signed-off-by: Gerrit Renker --- net/dccp/input.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/net/dccp/input.c b/net/dccp/input.c index 77a5d57ab702..9a108ce17fc7 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -159,13 +159,15 @@ static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb) dccp_time_wait(sk, DCCP_TIME_WAIT, 0); } -static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) +static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb) { - struct dccp_sock *dp = dccp_sk(sk); + struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec; - if (dp->dccps_hc_rx_ackvec != NULL) - dccp_ackvec_clear_state(dp->dccps_hc_rx_ackvec, - DCCP_SKB_CB(skb)->dccpd_ack_seq); + if (av == NULL) + return; + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq); + dccp_ackvec_input(av, skb); } static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb) @@ -364,21 +366,13 @@ discard: int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned len) { - struct dccp_sock *dp = dccp_sk(sk); - if (dccp_check_seqno(sk, skb)) goto discard; if (dccp_parse_options(sk, NULL, skb)) return 1; - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_event_ack_recv(sk, skb); - - if (dp->dccps_hc_rx_ackvec != NULL && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED)) - goto discard; + dccp_handle_ackvec_processing(sk, skb); dccp_deliver_input_to_ccids(sk, skb); return __dccp_rcv_established(sk, skb, dh, len); @@ -621,14 +615,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dccp_parse_options(sk, NULL, skb)) return 1; - if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_event_ack_recv(sk, skb); - - if (dp->dccps_hc_rx_ackvec != NULL && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, DCCPAV_RECEIVED)) - goto discard; - + dccp_handle_ackvec_processing(sk, skb); dccp_deliver_input_to_ccids(sk, skb); } From c2f42077bd06f300ae959204f3c007f820f5e769 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 072/105] dccp ccid-2: Schedule Sync as out-of-band mechanism The problem with Ack Vectors is that i) their length is variable and can in principle grow quite large, ii) it is hard to predict exactly how large they will be. Due to the second point it seems not a good idea to reduce the MPS; in particular when on average there is enough room for the Ack Vector and an increase in length is momentarily due to some burst loss, after which the Ack Vector returns to its normal/average length. The solution taken by this patch is to subtract a minimum-expected Ack Vector length from the MPS (previous patch), and to defer any larger Ack Vectors onto a separate Sync - but only if indeed there is no space left on the skb. This patch provides the infrastructure to schedule Sync-packets for transporting (urgent) out-of-band data. Its signalling is quicker than scheduling an Ack, since it does not need to wait for new application data. It can thus serve other parts of the DCCP code as well. Signed-off-by: Gerrit Renker --- include/linux/dccp.h | 2 ++ net/dccp/options.c | 24 ++++++++++++++++++++---- net/dccp/output.c | 8 ++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 7187bd8a75f6..83197b601a4f 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -462,6 +462,7 @@ struct dccp_ackvec; * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) + * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" * @dccps_xmit_timer - timer for when CCID is not ready to send * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) */ @@ -502,6 +503,7 @@ struct dccp_sock { __u8 dccps_hc_rx_insert_options:1; __u8 dccps_hc_tx_insert_options:1; __u8 dccps_server_timewait:1; + __u8 dccps_sync_scheduled:1; struct timer_list dccps_xmit_timer; }; diff --git a/net/dccp/options.c b/net/dccp/options.c index b11d7b7167f0..791e07853a79 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -430,6 +430,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); const u16 buflen = dccp_ackvec_buflen(av); /* Figure out how many options do we need to represent the ackvec */ const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN); @@ -438,10 +439,25 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) const unsigned char *tail, *from; unsigned char *to; - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) + if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) { + DCCP_WARN("Lacking space for %u bytes on %s packet\n", len, + dccp_packet_name(dcb->dccpd_type)); return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; + } + /* + * Since Ack Vectors are variable-length, we can not always predict + * their size. To catch exception cases where the space is running out + * on the skb, a separate Sync is scheduled to carry the Ack Vector. + */ + if (len > DCCPAV_MIN_OPTLEN && + len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) { + DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), " + "MPS=%u ==> reduce payload size?\n", len, skb->len, + dcb->dccpd_opt_len, dp->dccps_mss_cache); + dp->dccps_sync_scheduled = 1; + return 0; + } + dcb->dccpd_opt_len += len; to = skb_push(skb, len); len = buflen; @@ -482,7 +498,7 @@ static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) /* * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340. */ - if (dccp_ackvec_update_records(av, DCCP_SKB_CB(skb)->dccpd_seq, nonce)) + if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce)) return -ENOBUFS; return 0; } diff --git a/net/dccp/output.c b/net/dccp/output.c index 1b3168307586..bfda071559f4 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -305,6 +305,8 @@ void dccp_write_xmit(struct sock *sk, int block) if (err) DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", err); + if (dp->dccps_sync_scheduled) + dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); } else { dccp_pr_debug("packet discarded due to err=%d\n", err); kfree_skb(skb); @@ -591,6 +593,12 @@ void dccp_send_sync(struct sock *sk, const u64 ackno, DCCP_SKB_CB(skb)->dccpd_type = pkt_type; DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno; + /* + * Clear the flag in case the Sync was scheduled for out-of-band data, + * such as carrying a long Ack Vector. + */ + dccp_sk(sk)->dccps_sync_scheduled = 0; + dccp_transmit_skb(sk, skb); } From 5a577b488f687f339dea62e7bb4f4c5793ad523f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 073/105] dccp ccid-2: Remove old infrastructure This removes * functions for which updates have been provided in the preceding patches and * the @av_vec_len field - it is no longer necessary since the buffer length is now always computed dynamically; * conditional debugging code (CONFIG_IP_DCCP_ACKVEC). The reason for removing the conditional debugging code is that Ack Vectors are an almost inevitable necessity - RFC 4341 says that for CCID-2, Ack Vectors must be used. Furthermore, the code would be only interesting for coding - after some extensive testing with this patch set, having the debug code around is no longer of real help. Signed-off-by: Gerrit Renker --- net/dccp/Kconfig | 3 - net/dccp/Makefile | 5 +- net/dccp/ackvec.c | 251 ----------------------------------------- net/dccp/ackvec.h | 79 +------------ net/dccp/ccids/Kconfig | 1 - 5 files changed, 3 insertions(+), 336 deletions(-) diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 7aa2a7acc7ec..206c16ad9c3c 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -25,9 +25,6 @@ config INET_DCCP_DIAG def_tristate y if (IP_DCCP = y && INET_DIAG = y) def_tristate m -config IP_DCCP_ACKVEC - bool - source "net/dccp/ccids/Kconfig" menu "DCCP Kernel Hacking" diff --git a/net/dccp/Makefile b/net/dccp/Makefile index f4f8793aafff..b68440bd7fa2 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o -dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o +dccp-y := ccid.o feat.o input.o minisocks.o options.o \ + output.o proto.o timer.o ackvec.o dccp_ipv4-y := ipv4.o @@ -8,8 +9,6 @@ dccp_ipv4-y := ipv4.o obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o dccp_ipv6-y := ipv6.o -dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o - obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index bf9cb7d7549d..66b8a51300c0 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -9,18 +9,10 @@ * under the terms of the GNU General Public License as published by the * Free Software Foundation; version 2 of the License; */ - -#include "ackvec.h" #include "dccp.h" - -#include -#include #include -#include #include -#include - static struct kmem_cache *dccp_ackvec_slab; static struct kmem_cache *dccp_ackvec_record_slab; @@ -281,249 +273,6 @@ void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) } } -/* - * If several packets are missing, the HC-Receiver may prefer to enter multiple - * bytes with run length 0, rather than a single byte with a larger run length; - * this simplifies table updates if one of the missing packets arrives. - */ -static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, - const unsigned int packets, - const unsigned char state) -{ - unsigned int gap; - long new_head; - - if (av->av_vec_len + packets > DCCPAV_MAX_ACKVEC_LEN) - return -ENOBUFS; - - gap = packets - 1; - new_head = av->av_buf_head - packets; - - if (new_head < 0) { - if (gap > 0) { - memset(av->av_buf, DCCPAV_NOT_RECEIVED, - gap + new_head + 1); - gap = -new_head; - } - new_head += DCCPAV_MAX_ACKVEC_LEN; - } - - av->av_buf_head = new_head; - - if (gap > 0) - memset(av->av_buf + av->av_buf_head + 1, - DCCPAV_NOT_RECEIVED, gap); - - av->av_buf[av->av_buf_head] = state; - av->av_vec_len += packets; - return 0; -} - -/* - * Implements the RFC 4340, Appendix A - */ -int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, - const u64 ackno, const u8 state) -{ - u8 *cur_head = av->av_buf + av->av_buf_head, - *buf_end = av->av_buf + DCCPAV_MAX_ACKVEC_LEN; - /* - * Check at the right places if the buffer is full, if it is, tell the - * caller to start dropping packets till the HC-Sender acks our ACK - * vectors, when we will free up space in av_buf. - * - * We may well decide to do buffer compression, etc, but for now lets - * just drop. - * - * From Appendix A.1.1 (`New Packets'): - * - * Of course, the circular buffer may overflow, either when the - * HC-Sender is sending data at a very high rate, when the - * HC-Receiver's acknowledgements are not reaching the HC-Sender, - * or when the HC-Sender is forgetting to acknowledge those acks - * (so the HC-Receiver is unable to clean up old state). In this - * case, the HC-Receiver should either compress the buffer (by - * increasing run lengths when possible), transfer its state to - * a larger buffer, or, as a last resort, drop all received - * packets, without processing them whatsoever, until its buffer - * shrinks again. - */ - - /* See if this is the first ackno being inserted */ - if (av->av_vec_len == 0) { - *cur_head = state; - av->av_vec_len = 1; - } else if (after48(ackno, av->av_buf_ackno)) { - const u64 delta = dccp_delta_seqno(av->av_buf_ackno, ackno); - - /* - * Look if the state of this packet is the same as the - * previous ackno and if so if we can bump the head len. - */ - if (delta == 1 && dccp_ackvec_state(cur_head) == state && - dccp_ackvec_runlen(cur_head) < DCCPAV_MAX_RUNLEN) - *cur_head += 1; - else if (dccp_ackvec_set_buf_head_state(av, delta, state)) - return -ENOBUFS; - } else { - /* - * A.1.2. Old Packets - * - * When a packet with Sequence Number S <= buf_ackno - * arrives, the HC-Receiver will scan the table for - * the byte corresponding to S. (Indexing structures - * could reduce the complexity of this scan.) - */ - u64 delta = dccp_delta_seqno(ackno, av->av_buf_ackno); - - while (1) { - const u8 len = dccp_ackvec_runlen(cur_head); - /* - * valid packets not yet in av_buf have a reserved - * entry, with a len equal to 0. - */ - if (*cur_head == DCCPAV_NOT_RECEIVED && delta == 0) { - dccp_pr_debug("Found %llu reserved seat!\n", - (unsigned long long)ackno); - *cur_head = state; - goto out; - } - /* len == 0 means one packet */ - if (delta < len + 1) - goto out_duplicate; - - delta -= len + 1; - if (++cur_head == buf_end) - cur_head = av->av_buf; - } - } - - av->av_buf_ackno = ackno; -out: - return 0; - -out_duplicate: - /* Duplicate packet */ - dccp_pr_debug("Received a dup or already considered lost " - "packet: %llu\n", (unsigned long long)ackno); - return -EILSEQ; -} - -static void dccp_ackvec_throw_record(struct dccp_ackvec *av, - struct dccp_ackvec_record *avr) -{ - struct dccp_ackvec_record *next; - - /* sort out vector length */ - if (av->av_buf_head <= avr->avr_ack_ptr) - av->av_vec_len = avr->avr_ack_ptr - av->av_buf_head; - else - av->av_vec_len = DCCPAV_MAX_ACKVEC_LEN - 1 - - av->av_buf_head + avr->avr_ack_ptr; - - /* free records */ - list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) { - list_del(&avr->avr_node); - kmem_cache_free(dccp_ackvec_record_slab, avr); - } -} - -void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, - const u64 ackno) -{ - struct dccp_ackvec_record *avr; - - /* - * If we traverse backwards, it should be faster when we have large - * windows. We will be receiving ACKs for stuff we sent a while back - * -sorbo. - */ - list_for_each_entry_reverse(avr, &av->av_records, avr_node) { - if (ackno == avr->avr_ack_seqno) { - dccp_pr_debug("%s ACK packet 0, len=%d, ack_seqno=%llu, " - "ack_ackno=%llu, ACKED!\n", - dccp_role(sk), avr->avr_ack_runlen, - (unsigned long long)avr->avr_ack_seqno, - (unsigned long long)avr->avr_ack_ackno); - dccp_ackvec_throw_record(av, avr); - break; - } else if (avr->avr_ack_seqno > ackno) - break; /* old news */ - } -} - -static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, - struct sock *sk, u64 *ackno, - const unsigned char len, - const unsigned char *vector) -{ - unsigned char i; - struct dccp_ackvec_record *avr; - - /* Check if we actually sent an ACK vector */ - if (list_empty(&av->av_records)) - return; - - i = len; - /* - * XXX - * I think it might be more efficient to work backwards. See comment on - * rcv_ackno. -sorbo. - */ - avr = list_entry(av->av_records.next, struct dccp_ackvec_record, avr_node); - while (i--) { - const u8 rl = dccp_ackvec_runlen(vector); - u64 ackno_end_rl; - - dccp_set_seqno(&ackno_end_rl, *ackno - rl); - - /* - * If our AVR sequence number is greater than the ack, go - * forward in the AVR list until it is not so. - */ - list_for_each_entry_from(avr, &av->av_records, avr_node) { - if (!after48(avr->avr_ack_seqno, *ackno)) - goto found; - } - /* End of the av_records list, not found, exit */ - break; -found: - if (between48(avr->avr_ack_seqno, ackno_end_rl, *ackno)) { - if (dccp_ackvec_state(vector) != DCCPAV_NOT_RECEIVED) { - dccp_pr_debug("%s ACK vector 0, len=%d, " - "ack_seqno=%llu, ack_ackno=%llu, " - "ACKED!\n", - dccp_role(sk), len, - (unsigned long long) - avr->avr_ack_seqno, - (unsigned long long) - avr->avr_ack_ackno); - dccp_ackvec_throw_record(av, avr); - break; - } - /* - * If it wasn't received, continue scanning... we might - * find another one. - */ - } - - dccp_set_seqno(ackno, ackno_end_rl - 1); - ++vector; - } -} - -int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, - u64 *ackno, const u8 opt, const u8 *value, const u8 len) -{ - if (len > DCCP_SINGLE_OPT_MAXLEN) - return -1; - - /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ - dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk, - ackno, len, value); - return 0; -} - /** * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection * This routine is called when the peer acknowledges the receipt of Ack Vectors diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index 36ca2e9e5c84..db447503b636 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -64,7 +64,6 @@ static inline u8 dccp_ackvec_state(const u8 *cell) * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously) - * @av_veclen: length of the live portion of @av_buf */ struct dccp_ackvec { u8 av_buf[DCCPAV_MAX_ACKVEC_LEN]; @@ -75,7 +74,6 @@ struct dccp_ackvec { bool av_buf_nonce[DCCPAV_NUM_ACKVECS]; u8 av_overflow:1; struct list_head av_records; - u16 av_vec_len; }; /** struct dccp_ackvec_record - Records information about sent Ack Vectors @@ -101,25 +99,12 @@ struct dccp_ackvec_record { u8 avr_ack_nonce:1; }; -struct sock; -struct sk_buff; - -#ifdef CONFIG_IP_DCCP_ACKVEC -extern int dccp_ackvec_init(void); +extern int dccp_ackvec_init(void); extern void dccp_ackvec_exit(void); extern struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority); extern void dccp_ackvec_free(struct dccp_ackvec *av); -extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, - const u64 ackno, const u8 state); - -extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, - struct sock *sk, const u64 ackno); -extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, - u64 *ackno, const u8 opt, - const u8 *value, const u8 len); - extern void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb); extern int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum); extern void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno); @@ -129,66 +114,4 @@ static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) { return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; } -#else /* CONFIG_IP_DCCP_ACKVEC */ -static inline int dccp_ackvec_init(void) -{ - return 0; -} - -static inline void dccp_ackvec_exit(void) -{ -} - -static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) -{ - return NULL; -} - -static inline void dccp_ackvec_free(struct dccp_ackvec *av) -{ -} - -static inline void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb) -{ - -} - -static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, - const u64 ackno, const u8 state) -{ - return -1; -} - -static inline void dccp_ackvec_clear_state(struct dccp_ackvec *av, - const u64 ackno) -{ -} - -static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, - struct sock *sk, const u64 ackno) -{ -} - -static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, - const u64 *ackno, const u8 opt, - const u8 *value, const u8 len) -{ - return -1; -} - -static inline int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 nonce) -{ - return -1; -} - -static inline u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) -{ - return 0; -} - -static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) -{ - return true; -} -#endif /* CONFIG_IP_DCCP_ACKVEC */ #endif /* _ACKVEC_H */ diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 12275943eab8..44c7e90248fa 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -4,7 +4,6 @@ menu "DCCP CCIDs Configuration (EXPERIMENTAL)" config IP_DCCP_CCID2 tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" def_tristate IP_DCCP - select IP_DCCP_ACKVEC ---help--- CCID 2, TCP-like Congestion Control, denotes Additive Increase, Multiplicative Decrease (AIMD) congestion control with behavior From c8bf462bc567c3dcb083ff95cc13060dd06f138c Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 074/105] dccp ccid-2: Separate option parsing from CCID processing This patch replaces an almost identical replication of code: large parts of dccp_parse_options() re-appeared as ccid2_ackvector() in ccid2.c. Apart from the duplication, this caused two more problems: 1. CCIDs should not need to be concerned with parsing header options; 2. one can not assume that Ack Vectors appear as a contiguous area within an skb, it is legal to insert other options and/or padding in between. The current code would throw an error and stop reading in such a case. The patch provides a new data structure and associated list housekeeping. Only small changes were necessary to integrate with CCID-2: data structure initialisation, adapt list traversal routine, and add call to the provided cleanup routine. The latter also lead to fixing the following BUG: CCID-2 so far ignored Ack Vectors on all packets other than Ack/DataAck, which is incorrect, since Ack Vectors can be present on any packet that has an Ack field. Details: -------- * received Ack Vectors are parsed by dccp_parse_options() alone, which passes the result on to the CCID-specific routine ccid_hc_tx_parse_options(); * CCIDs interested in using/decoding Ack Vector information will add code to fetch parsed Ack Vectors via this interface; * a data structure, `struct dccp_ackvec_parsed' is provided as interface; * this structure arranges Ack Vectors of the same skb into a FIFO order; * a doubly-linked list is used to keep the required FIFO code small. Signed-off-by: Gerrit Renker --- net/dccp/ackvec.c | 28 +++++++++ net/dccp/ackvec.h | 19 ++++++ net/dccp/ccids/ccid2.c | 135 +++++++++++++---------------------------- net/dccp/ccids/ccid2.h | 2 + net/dccp/options.c | 17 +++--- 5 files changed, 101 insertions(+), 100 deletions(-) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 66b8a51300c0..41819848bdda 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -343,6 +343,34 @@ free_records: } } +/* + * Routines to keep track of Ack Vectors received in an skb + */ +int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce) +{ + struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC); + + if (new == NULL) + return -ENOBUFS; + new->vec = vec; + new->len = len; + new->nonce = nonce; + + list_add_tail(&new->node, head); + return 0; +} +EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add); + +void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks) +{ + struct dccp_ackvec_parsed *cur, *next; + + list_for_each_entry_safe(cur, next, parsed_chunks, node) + kfree(cur); + INIT_LIST_HEAD(parsed_chunks); +} +EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); + int __init dccp_ackvec_init(void) { dccp_ackvec_slab = kmem_cache_create("dccp_ackvec", diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index db447503b636..6cdca79a99f7 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -114,4 +114,23 @@ static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av) { return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail; } + +/** + * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb + * @vec: start of vector (offset into skb) + * @len: length of @vec + * @nonce: whether @vec had an ECN nonce of 0 or 1 + * @node: FIFO - arranged in descending order of ack_ackno + * This structure is used by CCIDs to access Ack Vectors in a received skb. + */ +struct dccp_ackvec_parsed { + u8 *vec, + len, + nonce:1; + struct list_head node; +}; + +extern int dccp_ackvec_parsed_add(struct list_head *head, + u8 *vec, u8 len, u8 nonce); +extern void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks); #endif /* _ACKVEC_H */ diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 813d5cd40e8b..bbf16b35734d 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -317,68 +317,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) #endif } -/* XXX Lame code duplication! - * returns -1 if none was found. - * else returns the next offset to use in the function call. - */ -static int ccid2_ackvector(struct sock *sk, struct sk_buff *skb, int offset, - unsigned char **vec, unsigned char *veclen) -{ - const struct dccp_hdr *dh = dccp_hdr(skb); - unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb); - unsigned char *opt_ptr; - const unsigned char *opt_end = (unsigned char *)dh + - (dh->dccph_doff * 4); - unsigned char opt, len; - unsigned char *value; - - BUG_ON(offset < 0); - options += offset; - opt_ptr = options; - if (opt_ptr >= opt_end) - return -1; - - while (opt_ptr != opt_end) { - opt = *opt_ptr++; - len = 0; - value = NULL; - - /* Check if this isn't a single byte option */ - if (opt > DCCPO_MAX_RESERVED) { - if (opt_ptr == opt_end) - goto out_invalid_option; - - len = *opt_ptr++; - if (len < 3) - goto out_invalid_option; - /* - * Remove the type and len fields, leaving - * just the value size - */ - len -= 2; - value = opt_ptr; - opt_ptr += len; - - if (opt_ptr > opt_end) - goto out_invalid_option; - } - - switch (opt) { - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - *vec = value; - *veclen = len; - return offset + (opt_ptr - options); - } - } - - return -1; - -out_invalid_option: - DCCP_BUG("Invalid option - this should not happen (previous parsing)!"); - return -1; -} - static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); @@ -499,15 +437,27 @@ static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) ccid2_change_l_ack_ratio(sk, hctx->cwnd); } +static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type, + u8 option, u8 *optval, u8 optlen) +{ + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + + switch (option) { + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + return dccp_ackvec_parsed_add(&hctx->av_chunks, optval, optlen, + option - DCCPO_ACK_VECTOR_0); + } + return 0; +} + static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + struct dccp_ackvec_parsed *avp; u64 ackno, seqno; struct ccid2_seq *seqp; - unsigned char *vector; - unsigned char veclen; - int offset = 0; int done = 0; unsigned int maxincr = 0; @@ -542,17 +492,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } /* check forward path congestion */ - /* still didn't send out new data packets */ - if (hctx->seqh == hctx->seqt) + if (dccp_packet_without_ack(skb)) return; - switch (DCCP_SKB_CB(skb)->dccpd_type) { - case DCCP_PKT_ACK: - case DCCP_PKT_DATAACK: - break; - default: - return; - } + /* still didn't send out new data packets */ + if (hctx->seqh == hctx->seqt) + goto done; ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq; if (after48(ackno, hctx->high_ack)) @@ -576,15 +521,16 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2); /* go through all ack vectors */ - while ((offset = ccid2_ackvector(sk, skb, offset, - &vector, &veclen)) != -1) { + list_for_each_entry(avp, &hctx->av_chunks, node) { /* go through this ack vector */ - while (veclen--) { - u64 ackno_end_rl = SUB48(ackno, dccp_ackvec_runlen(vector)); + for (; avp->len--; avp->vec++) { + u64 ackno_end_rl = SUB48(ackno, + dccp_ackvec_runlen(avp->vec)); - ccid2_pr_debug("ackvec start:%llu end:%llu\n", + ccid2_pr_debug("ackvec %llu |%u,%u|\n", (unsigned long long)ackno, - (unsigned long long)ackno_end_rl); + dccp_ackvec_state(avp->vec) >> 6, + dccp_ackvec_runlen(avp->vec)); /* if the seqno we are analyzing is larger than the * current ackno, then move towards the tail of our * seqnos. @@ -603,7 +549,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * run length */ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) { - const u8 state = dccp_ackvec_state(vector); + const u8 state = dccp_ackvec_state(avp->vec); /* new packet received or marked */ if (state != DCCPAV_NOT_RECEIVED && @@ -630,7 +576,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) break; ackno = SUB48(ackno_end_rl, 1); - vector++; } if (done) break; @@ -694,6 +639,8 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) } ccid2_hc_tx_check_sanity(hctx); +done: + dccp_ackvec_parsed_cleanup(&hctx->av_chunks); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) @@ -727,6 +674,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) hctx->rpdupack = -1; hctx->last_cong = jiffies; setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); + INIT_LIST_HEAD(&hctx->av_chunks); ccid2_hc_tx_check_sanity(hctx); return 0; @@ -762,17 +710,18 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) } static struct ccid_operations ccid2 = { - .ccid_id = DCCPC_CCID2, - .ccid_name = "TCP-like", - .ccid_owner = THIS_MODULE, - .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), - .ccid_hc_tx_init = ccid2_hc_tx_init, - .ccid_hc_tx_exit = ccid2_hc_tx_exit, - .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, - .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, - .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, - .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), - .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, + .ccid_id = DCCPC_CCID2, + .ccid_name = "TCP-like", + .ccid_owner = THIS_MODULE, + .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), + .ccid_hc_tx_init = ccid2_hc_tx_init, + .ccid_hc_tx_exit = ccid2_hc_tx_exit, + .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet, + .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent, + .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options, + .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv, + .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock), + .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv, }; #ifdef CONFIG_IP_DCCP_CCID2_DEBUG diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index d7815804bceb..907deed255b4 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -47,6 +47,7 @@ struct ccid2_seq { * @lastrtt: time RTT was last measured * @rpseq: last consecutive seqno * @rpdupack: dupacks since rpseq + * @av_chunks: list of Ack Vectors received on current skb */ struct ccid2_hc_tx_sock { u32 cwnd; @@ -66,6 +67,7 @@ struct ccid2_hc_tx_sock { int rpdupack; unsigned long last_cong; u64 high_ack; + struct list_head av_chunks; }; struct ccid2_hc_rx_sock { diff --git a/net/dccp/options.c b/net/dccp/options.c index 791e07853a79..e5a32979d7d7 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -128,13 +128,6 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, if (rc) goto out_featneg_failed; break; - case DCCPO_ACK_VECTOR_0: - case DCCPO_ACK_VECTOR_1: - if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ - break; - dccp_pr_debug("%s Ack Vector (len=%u)\n", dccp_role(sk), - len); - break; case DCCPO_TIMESTAMP: if (len != 4) goto out_invalid_option; @@ -224,6 +217,16 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, pkt_type, opt, value, len)) goto out_invalid_option; break; + case DCCPO_ACK_VECTOR_0: + case DCCPO_ACK_VECTOR_1: + if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ + break; + /* + * Ack vectors are processed by the TX CCID if it is + * interested. The RX CCID need not parse Ack Vectors, + * since it is only interested in clearing old state. + * Fall through. + */ case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, pkt_type, opt, value, len)) From f4a66ca4d2ff093c0f9111b449a248ffb8209b4d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 075/105] dccp: Return-value convention of hc_tx_send_packet() This patch reorganises the return value convention of the CCID TX sending function, to permit more flexible schemes, as required by subsequent patches. Currently the convention is * values < 0 mean error, * a value == 0 means "send now", and * a value x > 0 means "send in x milliseconds". The patch provides symbolic constants and a function to interpret return values. In addition, it caps the maximum positive return value to 0xFFFF milliseconds, corresponding to 65.535 seconds. This is possible since in CCID-3 the maximum inter-packet gap is t_mbi = 64 sec. Signed-off-by: Gerrit Renker --- net/dccp/ccid.h | 34 +++++++++++++++++++++++++++++++--- net/dccp/ccids/ccid3.c | 12 ++++++------ 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 430a3c24f970..d27054ba2159 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -129,13 +129,41 @@ static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); +/* + * Congestion control of queued data packets via CCID decision. + * + * The TX CCID performs its congestion-control by indicating whether and when a + * queued packet may be sent, using the return code of ccid_hc_tx_send_packet(). + * The following modes are supported via the symbolic constants below: + * - timer-based pacing (CCID returns a delay value in milliseconds); + * - autonomous dequeueing (CCID internally schedules dccps_xmitlet). + */ + +enum ccid_dequeueing_decision { + CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */ + CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */ + CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */ + CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */ + CCID_PACKET_ERR = 0xF0000, /* error condition */ +}; + +static inline int ccid_packet_dequeue_eval(const int return_code) +{ + if (return_code < 0) + return CCID_PACKET_ERR; + if (return_code == 0) + return CCID_PACKET_SEND_AT_ONCE; + if (return_code <= CCID_PACKET_DELAY_MAX) + return CCID_PACKET_DELAY; + return return_code; +} + static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, struct sk_buff *skb) { - int rc = 0; if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL) - rc = ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); - return rc; + return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb); + return CCID_PACKET_SEND_AT_ONCE; } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 0751a8fa936a..0d406f82e433 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -269,11 +269,11 @@ out: sock_put(sk); } -/* - * returns - * > 0: delay (in msecs) that should pass before actually sending - * = 0: can send immediately - * < 0: error condition; do not send packet +/** + * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets + * @skb: next packet candidate to send on @sk + * This function uses the convention of ccid_packet_dequeue_eval() and + * returns a millisecond-delay value between 0 and t_mbi = 64000 msec. */ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { @@ -349,7 +349,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) /* set the nominal send time for the next following packet */ hctx->t_nom = ktime_add_us(hctx->t_nom, hctx->t_ipi); - return 0; + return CCID_PACKET_SEND_AT_ONCE; } static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) From e7937772d7a2b0127cc4cbc67bc594e139fdaf63 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 076/105] dccp: Extend CCID packet dequeueing interface This extends the packet dequeuing interface of dccp_write_xmit() to allow 1. CCIDs to take care of timing when the next packet may be sent; 2. delayed sending (as before, with an inter-packet gap up to 65.535 seconds). The main purpose is to take CCID2 out of its polling mode (when it is network- limited, it tries every millisecond to send, without interruption). The interface can also be used to support other CCIDs. The mode of operation for (2) is as follows: * new packet is enqueued via dccp_sendmsg() => dccp_write_xmit(), * ccid_hc_tx_send_packet() detects that it may not send (e.g. window full), * it signals this condition via `CCID_PACKET_WILL_DEQUEUE_LATER', * dccp_write_xmit() returns without further action; * after some time the wait-condition for CCID becomes true, * that CCID schedules the tasklet, * tasklet function calls ccid_hc_tx_send_packet() via dccp_write_xmit(), * since the wait-condition is now true, ccid_hc_tx_packet() returns "send now", * packet is sent, and possibly more (since dccp_write_xmit() loops). Code reuse: the taskled function calls dccp_write_xmit(), the timer function reduces to a wrapper around the same code. If the tasklet finds that the socket is locked, it re-schedules the tasklet function (not the tasklet) after one jiffy. Changed DCCP_BUG to dccp_pr_debug when transmit_skb returns an error (e.g. when a local qdisc is used, NET_XMIT_DROP=1 can be returned for many packets). Signed-off-by: Gerrit Renker --- include/linux/dccp.h | 4 +- net/dccp/output.c | 129 +++++++++++++++++++++++++++---------------- net/dccp/timer.c | 25 +++++---- 3 files changed, 98 insertions(+), 60 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 83197b601a4f..eed52bcd35d0 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -463,7 +463,8 @@ struct dccp_ackvec; * @dccps_hc_tx_insert_options - sender wants to add options when sending * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" - * @dccps_xmit_timer - timer for when CCID is not ready to send + * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets + * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) */ struct dccp_sock { @@ -504,6 +505,7 @@ struct dccp_sock { __u8 dccps_hc_tx_insert_options:1; __u8 dccps_server_timewait:1; __u8 dccps_sync_scheduled:1; + struct tasklet_struct dccps_xmitlet; struct timer_list dccps_xmit_timer; }; diff --git a/net/dccp/output.c b/net/dccp/output.c index bfda071559f4..9afd58e39e23 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -251,65 +251,98 @@ do_interrupted: goto out; } +/** + * dccp_xmit_packet - Send data packet under control of CCID + * Transmits next-queued payload and informs CCID to account for the packet. + */ +static void dccp_xmit_packet(struct sock *sk) +{ + int err, len; + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue); + + if (unlikely(skb == NULL)) + return; + len = skb->len; + + if (sk->sk_state == DCCP_PARTOPEN) { + const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; + /* + * See 8.1.5 - Handshake Completion. + * + * For robustness we resend Confirm options until the client has + * entered OPEN. During the initial feature negotiation, the MPS + * is smaller than usual, reduced by the Change/Confirm options. + */ + if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { + DCCP_WARN("Payload too large (%d) for featneg.\n", len); + dccp_send_ack(sk); + dccp_feat_list_purge(&dp->dccps_featneg); + } + + inet_csk_schedule_ack(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + inet_csk(sk)->icsk_rto, + DCCP_RTO_MAX); + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else if (dccp_ack_pending(sk)) { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK; + } else { + DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA; + } + + err = dccp_transmit_skb(sk, skb); + if (err) + dccp_pr_debug("transmit_skb() returned err=%d\n", err); + /* + * Register this one as sent even if an error occurred. To the remote + * end a local packet drop is indistinguishable from network loss, i.e. + * any local drop will eventually be reported via receiver feedback. + */ + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); + + /* + * If the CCID needs to transfer additional header options out-of-band + * (e.g. Ack Vectors or feature-negotiation options), it activates this + * flag to schedule a Sync. The Sync will automatically incorporate all + * currently pending header options, thus clearing the backlog. + */ + if (dp->dccps_sync_scheduled) + dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); +} + void dccp_write_xmit(struct sock *sk, int block) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; while ((skb = skb_peek(&sk->sk_write_queue))) { - int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); + int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - if (err > 0) { + switch (ccid_packet_dequeue_eval(rc)) { + case CCID_PACKET_WILL_DEQUEUE_LATER: + return; + case CCID_PACKET_DELAY: if (!block) { sk_reset_timer(sk, &dp->dccps_xmit_timer, - msecs_to_jiffies(err)+jiffies); + msecs_to_jiffies(rc)+jiffies); + return; + } + rc = dccp_wait_for_ccid(sk, skb, rc); + if (rc && rc != -EINTR) { + DCCP_BUG("err=%d after dccp_wait_for_ccid", rc); + skb_dequeue(&sk->sk_write_queue); + kfree_skb(skb); break; - } else - err = dccp_wait_for_ccid(sk, skb, err); - if (err && err != -EINTR) - DCCP_BUG("err=%d after dccp_wait_for_ccid", err); - } - - skb_dequeue(&sk->sk_write_queue); - if (err == 0) { - struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); - const int len = skb->len; - - if (sk->sk_state == DCCP_PARTOPEN) { - const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD; - /* - * See 8.1.5 - Handshake Completion. - * - * For robustness we resend Confirm options until the client has - * entered OPEN. During the initial feature negotiation, the MPS - * is smaller than usual, reduced by the Change/Confirm options. - */ - if (!list_empty(&dp->dccps_featneg) && len > cur_mps) { - DCCP_WARN("Payload too large (%d) for featneg.\n", len); - dccp_send_ack(sk); - dccp_feat_list_purge(&dp->dccps_featneg); - } - - inet_csk_schedule_ack(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - inet_csk(sk)->icsk_rto, - DCCP_RTO_MAX); - dcb->dccpd_type = DCCP_PKT_DATAACK; - } else if (dccp_ack_pending(sk)) - dcb->dccpd_type = DCCP_PKT_DATAACK; - else - dcb->dccpd_type = DCCP_PKT_DATA; - - err = dccp_transmit_skb(sk, skb); - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); - if (err) - DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", - err); - if (dp->dccps_sync_scheduled) - dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); - } else { - dccp_pr_debug("packet discarded due to err=%d\n", err); + } + /* fall through */ + case CCID_PACKET_SEND_AT_ONCE: + dccp_xmit_packet(sk); + break; + case CCID_PACKET_ERR: + skb_dequeue(&sk->sk_write_queue); kfree_skb(skb); + dccp_pr_debug("packet discarded due to err=%d\n", rc); } } } diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 162d1e683c39..9369aca4b0e9 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -237,32 +237,35 @@ out: sock_put(sk); } -/* Transmit-delay timer: used by the CCIDs to delay actual send time */ -static void dccp_write_xmit_timer(unsigned long data) +/** + * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface + * See the comments above %ccid_dequeueing_decision for supported modes. + */ +static void dccp_write_xmitlet(unsigned long data) { struct sock *sk = (struct sock *)data; - struct dccp_sock *dp = dccp_sk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) - sk_reset_timer(sk, &dp->dccps_xmit_timer, jiffies+1); + sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); else dccp_write_xmit(sk, 0); bh_unlock_sock(sk); - sock_put(sk); } -static void dccp_init_write_xmit_timer(struct sock *sk) +static void dccp_write_xmit_timer(unsigned long data) { - struct dccp_sock *dp = dccp_sk(sk); - - setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, - (unsigned long)sk); + dccp_write_xmitlet(data); + sock_put((struct sock *)data); } void dccp_init_xmit_timers(struct sock *sk) { - dccp_init_write_xmit_timer(sk); + struct dccp_sock *dp = dccp_sk(sk); + + tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk); + setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer, + (unsigned long)sk); inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer, &dccp_keepalive_timer); } From 146993cf5174472644ed11bd5fb539f0af8bfa49 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 077/105] dccp: Refine the wait-for-ccid mechanism This extends the existing wait-for-ccid routine so that it may be used with different types of CCID. It further addresses the problems listed below. The code looks if the write queue is non-empty and grants the TX CCID up to `timeout' jiffies to drain the queue. It will instead purge that queue if * the delay suggested by the CCID exceeds the time budget; * a socket error occurred while waiting for the CCID; * there is a signal pending (eg. annoyed user pressed Control-C); * the CCID does not support delays (we don't know how long it will take). D e t a i l s [can be removed] ------------------------------- DCCP's sending mechanism functions a bit like non-blocking I/O: dccp_sendmsg() will enqueue up to net.dccp.default.tx_qlen packets (default=5), without waiting for them to be released to the network. Rate-based CCIDs, such as CCID3/4, can impose sending delays of up to maximally 64 seconds (t_mbi in RFC 3448). Hence the write queue may still contain packets when the application closes. Since the write queue is congestion-controlled by the CCID, draining the queue is also under control of the CCID. There are several problems that needed to be addressed: 1) The queue-drain mechanism only works with rate-based CCIDs. If CCID2 for example has a full TX queue and becomes network-limited just as the application wants to close, then waiting for CCID2 to become unblocked could lead to an indefinite delay (i.e., application "hangs"). 2) Since each TX CCID in turn uses a feedback mechanism, there may be changes in its sending policy while the queue is being drained. This can lead to further delays during which the application will not be able to terminate. 3) The minimum wait time for CCID3/4 can be expected to be the queue length times the current inter-packet delay. For example if tx_qlen=100 and a delay of 15 ms is used for each packet, then the application would have to wait for a minimum of 1.5 seconds before being allowed to exit. 4) There is no way for the user/application to control this behaviour. It would be good to use the timeout argument of dccp_close() as an upper bound. Then the maximum time that an application is willing to wait for its CCIDs to can be set via the SO_LINGER option. These problems are addressed by giving the CCID a grace period of up to the `timeout' value. The wait-for-ccid function is, as before, used when the application (a) has read all the data in its receive buffer and (b) if SO_LINGER was set with a non-zero linger time, or (c) the socket is either in the OPEN (active close) or in the PASSIVE_CLOSEREQ state (client application closes after receiving CloseReq). In addition, there is a catch-all case by calling __skb_queue_purge() after waiting for the CCID. This is necessary since the write queue may still have data when (a) the host has been passively-closed, (b) abnormal termination (unread data, zero linger time), (c) wait-for-ccid could not finish within the given time limit. Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 3 +- net/dccp/output.c | 113 ++++++++++++++++++++++++++-------------------- net/dccp/proto.c | 15 +++++- net/dccp/timer.c | 2 +- 4 files changed, 81 insertions(+), 52 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 1e65378eea3f..74c90cd27677 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -234,8 +234,9 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); -extern void dccp_write_xmit(struct sock *sk, int block); +extern void dccp_write_xmit(struct sock *sk); extern void dccp_write_space(struct sock *sk); +extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); extern void dccp_init_xmit_timers(struct sock *sk); static inline void dccp_clear_xmit_timers(struct sock *sk) diff --git a/net/dccp/output.c b/net/dccp/output.c index 9afd58e39e23..39056dc61355 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -206,49 +206,29 @@ void dccp_write_space(struct sock *sk) } /** - * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet + * dccp_wait_for_ccid - Await CCID send permission * @sk: socket to wait for - * @skb: current skb to pass on for waiting - * @delay: sleep timeout in milliseconds (> 0) - * This function is called by default when the socket is closed, and - * when a non-zero linger time is set on the socket. For consistency + * @delay: timeout in jiffies + * This is used by CCIDs which need to delay the send time in process context. */ -static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb, int delay) +static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay) { - struct dccp_sock *dp = dccp_sk(sk); DEFINE_WAIT(wait); - unsigned long jiffdelay; - int rc; + long remaining; - do { - dccp_pr_debug("delayed send by %d msec\n", delay); - jiffdelay = msecs_to_jiffies(delay); + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending++; + release_sock(sk); - prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + remaining = schedule_timeout(delay); - sk->sk_write_pending++; - release_sock(sk); - schedule_timeout(jiffdelay); - lock_sock(sk); - sk->sk_write_pending--; - - if (sk->sk_err) - goto do_error; - if (signal_pending(current)) - goto do_interrupted; - - rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); - } while ((delay = rc) > 0); -out: + lock_sock(sk); + sk->sk_write_pending--; finish_wait(sk->sk_sleep, &wait); - return rc; -do_error: - rc = -EPIPE; - goto out; -do_interrupted: - rc = -EINTR; - goto out; + if (signal_pending(current) || sk->sk_err) + return -1; + return remaining; } /** @@ -311,7 +291,53 @@ static void dccp_xmit_packet(struct sock *sk) dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC); } -void dccp_write_xmit(struct sock *sk, int block) +/** + * dccp_flush_write_queue - Drain queue at end of connection + * Since dccp_sendmsg queues packets without waiting for them to be sent, it may + * happen that the TX queue is not empty at the end of a connection. We give the + * HC-sender CCID a grace period of up to @time_budget jiffies. If this function + * returns with a non-empty write queue, it will be purged later. + */ +void dccp_flush_write_queue(struct sock *sk, long *time_budget) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct sk_buff *skb; + long delay, rc; + + while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) { + rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); + + switch (ccid_packet_dequeue_eval(rc)) { + case CCID_PACKET_WILL_DEQUEUE_LATER: + /* + * If the CCID determines when to send, the next sending + * time is unknown or the CCID may not even send again + * (e.g. remote host crashes or lost Ack packets). + */ + DCCP_WARN("CCID did not manage to send all packets\n"); + return; + case CCID_PACKET_DELAY: + delay = msecs_to_jiffies(rc); + if (delay > *time_budget) + return; + rc = dccp_wait_for_ccid(sk, delay); + if (rc < 0) + return; + *time_budget -= (delay - rc); + /* check again if we can send now */ + break; + case CCID_PACKET_SEND_AT_ONCE: + dccp_xmit_packet(sk); + break; + case CCID_PACKET_ERR: + skb_dequeue(&sk->sk_write_queue); + kfree_skb(skb); + dccp_pr_debug("packet discarded due to err=%ld\n", rc); + } + } +} + +void dccp_write_xmit(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; @@ -323,19 +349,9 @@ void dccp_write_xmit(struct sock *sk, int block) case CCID_PACKET_WILL_DEQUEUE_LATER: return; case CCID_PACKET_DELAY: - if (!block) { - sk_reset_timer(sk, &dp->dccps_xmit_timer, - msecs_to_jiffies(rc)+jiffies); - return; - } - rc = dccp_wait_for_ccid(sk, skb, rc); - if (rc && rc != -EINTR) { - DCCP_BUG("err=%d after dccp_wait_for_ccid", rc); - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); - break; - } - /* fall through */ + sk_reset_timer(sk, &dp->dccps_xmit_timer, + jiffies + msecs_to_jiffies(rc)); + return; case CCID_PACKET_SEND_AT_ONCE: dccp_xmit_packet(sk); break; @@ -660,7 +676,6 @@ void dccp_send_close(struct sock *sk, const int active) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; if (active) { - dccp_write_xmit(sk, 1); dccp_skb_entail(sk, skb); dccp_transmit_skb(sk, skb_clone(skb, prio)); /* diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 11905e0cf8f7..8c125ffab1c5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -735,7 +735,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto out_discard; skb_queue_tail(&sk->sk_write_queue, skb); - dccp_write_xmit(sk,0); + dccp_write_xmit(sk); out_release: release_sock(sk); return rc ? : len; @@ -958,9 +958,22 @@ void dccp_close(struct sock *sk, long timeout) /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); } else if (sk->sk_state != DCCP_CLOSED) { + /* + * Normal connection termination. May need to wait if there are + * still packets in the TX queue that are delayed by the CCID. + */ + dccp_flush_write_queue(sk, &timeout); dccp_terminate_connection(sk); } + /* + * Flush write queue. This may be necessary in several cases: + * - we have been closed by the peer but still have application data; + * - abortive termination (unread data or zero linger time), + * - normal termination but queue could not be flushed within time limit + */ + __skb_queue_purge(&sk->sk_write_queue); + sk_stream_wait_close(sk, timeout); adjudge_to_death: diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 9369aca4b0e9..e02d5a94f4c0 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -249,7 +249,7 @@ static void dccp_write_xmitlet(unsigned long data) if (sock_owned_by_user(sk)) sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1); else - dccp_write_xmit(sk, 0); + dccp_write_xmit(sk); bh_unlock_sock(sk); } From 83337dae6ca94d801b6700600244865cd694205b Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 078/105] dccp ccid-2: Stop polling This updates CCID2 to use the CCID dequeuing mechanism, converting from previous constant-polling to a now event-driven mechanism. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid2.c | 21 +++++++++++++-------- net/dccp/ccids/ccid2.h | 5 +++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index bbf16b35734d..c7d83e3c1648 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -123,12 +123,9 @@ static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) { - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - - if (hctx->pipe < hctx->cwnd) - return 0; - - return 1; /* XXX CCID should dequeue when ready instead of polling */ + if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk))) + return CCID_PACKET_WILL_DEQUEUE_LATER; + return CCID_PACKET_SEND_AT_ONCE; } static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) @@ -168,6 +165,7 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); long s; bh_lock_sock(sk); @@ -187,8 +185,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) if (s > 60) hctx->rto = 60 * HZ; - ccid2_start_rto_timer(sk); - /* adjust pipe, cwnd etc */ hctx->ssthresh = hctx->cwnd / 2; if (hctx->ssthresh < 2) @@ -205,6 +201,11 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hctx->rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); ccid2_hc_tx_check_sanity(hctx); + + /* if we were blocked before, we may now send cwnd=1 packet */ + if (sender_was_blocked) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); + ccid2_start_rto_timer(sk); out: bh_unlock_sock(sk); sock_put(sk); @@ -455,6 +456,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); struct dccp_ackvec_parsed *avp; u64 ackno, seqno; struct ccid2_seq *seqp; @@ -640,6 +642,9 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) ccid2_hc_tx_check_sanity(hctx); done: + /* check if incoming Acks allow pending packets to be sent */ + if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) + tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); dccp_ackvec_parsed_cleanup(&hctx->av_chunks); } diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 907deed255b4..d5900dd5d4f4 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -70,6 +70,11 @@ struct ccid2_hc_tx_sock { struct list_head av_chunks; }; +static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hctx) +{ + return (hctx->pipe >= hctx->cwnd); +} + struct ccid2_hc_rx_sock { int data; }; From c6f0f2e71f3088a0f05502d6adb0f667b84028c3 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 079/105] dccp ccid-2: Remove redundant sanity tests This removes the ccid2_hc_tx_check_sanity function: it is redundant. Details: ======== The tx_check_sanity function performs three tests: 1) it checks that the circular TX list is sorted - in ascending order of sequence number (ccid2s_seq) - and time (ccid2s_sent), - in the direction from `tail' (hctx_seqt) to `head' (hctx_seqh); 2) it ensures that the entire list has the length seqbufc * CCID2_SEQBUF_LEN; 3) it ensures that pipe equals the number of packets that were not marked `acked' (ccid2s_acked) between `tail' and `head'. The following argues that each of these tests is redundant, this can be verified by going through the code. (1) is not necessary, since both time and GSS increase from one packet to the next, so that subsequent insertions in tx_packet_sent (which advance the `head' pointer) will be in ascending order of time and sequence number. In (2), the length of the list is always equal to seqbufc times CCID2_SEQBUF_LEN (set to 1024) unless allocation caused an earlier failure, because: * at initialisation (tx_init), there is one chunk of size 1024 and seqbufc=1; * subsequent calls to tx_alloc_seq take place whenever head->next == tail in tx_packet_sent; then a new chunk of size 1024 is inserted between head and tail, and seqbufc is incremented by one. To show that (3) is redundant requires looking at two cases. The `pipe' variable of the TX socket is incremented only in tx_packet_sent, and decremented in tx_packet_recv. When head == tail (TX history empty) then pipe should be 0, which is the case directly after initialisation and after a retransmission timeout has occurred (ccid2_hc_tx_rto_expire). The first case involves parsing Ack Vectors for packets recorded in the live portion of the buffer, between tail and head. For each packet marked by the receiver as received (state 0) or ECN-marked (state 1), pipe is decremented by one, so for all such packets the BUG_ON in tx_check_sanity will not trigger. The second case is the loss detection in the second half of tx_packet_recv, below the comment "Check for NUMDUPACK". The first while-loop here ensures that the sequence number of `seqp' is either above or equal to `high_ack', or otherwise equal to the highest sequence number sent so far (of the entry head->prev, as head points to the next unsent entry). The next while-loop ("while (1)") counts the number of acked packets starting from that position of seqp, going backwards in the direction from head->prev to tail. If NUMDUPACK=3 such packets were counted within this loop, `seqp' points to the last acknowledged packet of these, and the "if (done == NUMDUPACK)" block is entered next. The while-loop contained within that block in turn traverses the list backwards, from head to tail; the position of `seqp' is saved in the variable `last_acked'. For each packet not marked as `acked', a congestion event is triggered within the loop, and pipe is decremented. The loop terminates when `seqp' has reached `tail', whereupon tail is set to the position previously stored in `last_acked'. Thus, between `last_acked' and the previous position of `tail', - pipe has been decremented earlier if the packet was marked as state 0 or 1; - pipe was decremented if the packet was not marked as acked. That is, pipe has been decremented by the number of packets between `last_acked' and the previous position of `tail'. As a consequence, pipe now again reflects the number of packets which have not (yet) been acked between the new position of tail (at `last_acked') and head->prev, or 0 if head==tail. The result is that the BUG_ON condition in check_sanity will also not be triggered, hence the test (3) is also redundant. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid2.c | 51 ------------------------------------------ 1 file changed, 51 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index c7d83e3c1648..3b2548bd73f3 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -34,51 +34,8 @@ #ifdef CONFIG_IP_DCCP_CCID2_DEBUG static int ccid2_debug; #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) - -static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hctx) -{ - int len = 0; - int pipe = 0; - struct ccid2_seq *seqp = hctx->seqh; - - /* there is data in the chain */ - if (seqp != hctx->seqt) { - seqp = seqp->ccid2s_prev; - len++; - if (!seqp->ccid2s_acked) - pipe++; - - while (seqp != hctx->seqt) { - struct ccid2_seq *prev = seqp->ccid2s_prev; - - len++; - if (!prev->ccid2s_acked) - pipe++; - - /* packets are sent sequentially */ - BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, - prev->ccid2s_seq ) >= 0); - BUG_ON(time_before(seqp->ccid2s_sent, - prev->ccid2s_sent)); - - seqp = prev; - } - } - - BUG_ON(pipe != hctx->pipe); - ccid2_pr_debug("len of chain=%d\n", len); - - do { - seqp = seqp->ccid2s_prev; - len++; - } while (seqp != hctx->seqh); - - ccid2_pr_debug("total len=%d\n", len); - BUG_ON(len != hctx->seqbufc * CCID2_SEQBUF_LEN); -} #else #define ccid2_pr_debug(format, a...) -#define ccid2_hc_tx_check_sanity(hctx) #endif static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hctx) @@ -176,8 +133,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) ccid2_pr_debug("RTO_EXPIRE\n"); - ccid2_hc_tx_check_sanity(hctx); - /* back-off timer */ hctx->rto <<= 1; @@ -200,7 +155,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hctx->rpseq = 0; hctx->rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); - ccid2_hc_tx_check_sanity(hctx); /* if we were blocked before, we may now send cwnd=1 packet */ if (sender_was_blocked) @@ -314,7 +268,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) } } while (0); ccid2_pr_debug("=========\n"); - ccid2_hc_tx_check_sanity(hctx); #endif } @@ -463,7 +416,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) int done = 0; unsigned int maxincr = 0; - ccid2_hc_tx_check_sanity(hctx); /* check reverse path congestion */ seqno = DCCP_SKB_CB(skb)->dccpd_seq; @@ -640,7 +592,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx->seqt = hctx->seqt->ccid2s_next; } - ccid2_hc_tx_check_sanity(hctx); done: /* check if incoming Acks allow pending packets to be sent */ if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) @@ -680,8 +631,6 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) hctx->last_cong = jiffies; setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); INIT_LIST_HEAD(&hctx->av_chunks); - - ccid2_hc_tx_check_sanity(hctx); return 0; } From e9803c0104564698d3b8e84ccdb0b8b0e65427e2 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 080/105] dccp ccid-2: Simplify dec_pipe and rearming of RTO timer This removes the dec_pipe function and improves the way the RTO timer is rearmed when a new acknowledgment comes in. Details and justification for removal: -------------------------------------- 1) The BUG_ON in dec_pipe is never triggered: pipe is only decremented for TX history entries between tail and head, for which it had previously been incremented in tx_packet_sent; and it is not decremented twice for the same entry, since it is - either decremented when a corresponding Ack Vector cell in state 0 or 1 was received (and then ccid2s_acked==1), - or it is decremented when ccid2s_acked==0, as part of the loss detection in tx_packet_recv (and hence it can not have been decremented earlier). 2) Restarting the RTO timer happens for every single entry in each Ack Vector parsed by tx_packet_recv (according to RFC 4340, 11.4 this can happen up to 16192 times per Ack Vector). 3) The RTO timer should not be restarted when all outstanding data has been acknowledged. This is currently done similar to (2), in dec_pipe, when pipe has reached 0. The patch onsolidates the code which rearms the RTO timer, combining the segments from new_ack and dec_pipe. As a result, the code becomes clearer (compare with tcp_rearm_rto()). Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid2.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 3b2548bd73f3..fa074d442065 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -353,23 +353,6 @@ static inline void ccid2_new_ack(struct sock *sk, hctx->srtt, hctx->rttvar, hctx->rto, HZ, r); } - - /* we got a new ack, so re-start RTO timer */ - ccid2_hc_tx_kill_rto_timer(sk); - ccid2_start_rto_timer(sk); -} - -static void ccid2_hc_tx_dec_pipe(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - - if (hctx->pipe == 0) - DCCP_BUG("pipe == 0"); - else - hctx->pipe--; - - if (hctx->pipe == 0) - ccid2_hc_tx_kill_rto_timer(sk); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) @@ -518,7 +501,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) seqp->ccid2s_acked = 1; ccid2_pr_debug("Got ack for %llu\n", (unsigned long long)seqp->ccid2s_seq); - ccid2_hc_tx_dec_pipe(sk); + hctx->pipe--; } if (seqp == hctx->seqt) { done = 1; @@ -574,7 +557,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * one ack vector. */ ccid2_congestion_event(sk, seqp); - ccid2_hc_tx_dec_pipe(sk); + hctx->pipe--; } if (seqp == hctx->seqt) break; @@ -592,6 +575,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx->seqt = hctx->seqt->ccid2s_next; } + /* restart RTO timer if not all outstanding data has been acked */ + if (hctx->pipe == 0) + sk_stop_timer(sk, &hctx->rtotimer); + else + sk_reset_timer(sk, &hctx->rtotimer, + jiffies + hctx->rto); done: /* check if incoming Acks allow pending packets to be sent */ if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) From 1435562d7e0412e4885b661843f69859013f9d25 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 081/105] dccp ccid-2: Replace broken RTT estimator with better algorithm The current CCID-2 RTT estimator code is in parts broken and lags behind the suggestions in RFC2988 of using scaled variants for SRTT/RTTVAR. That code is replaced by the present patch, which reuses the Linux TCP RTT estimator code - reasons for this code duplication are given below. Further details: ---------------- 1. The minimum RTO of previously one second has been replaced with TCP's, since RFC4341, sec. 5 says that the minimum of 1 sec. (suggested in RFC2988, 2.4) is not necessary. Instead, the TCP_RTO_MIN is used, which agrees with DCCP's concept of a default RTT (RFC 4340, 3.4). 2. The maximum RTO has been set to DCCP_RTO_MAX (64 sec), which agrees with RFC2988, (2.5). 3. De-inlined the function ccid2_new_ack(). 4. Added a FIXME: the RTT is sampled several times per Ack Vector, which will give the wrong estimate. It should be replaced with one sample per Ack. However, at the moment this can not be resolved easily, since - it depends on TX history code (which also needs some work), - the cleanest solution is not to use the `sent' time at all (saves 4 bytes per entry) and use DCCP timestamps / elapsed time to estimated the RTT, which however is non-trivial to get right (but needs to be done). Reasons for reusing the Linux TCP estimator algorithm: ------------------------------------------------------ Some time was spent to find a better alternative, using basic RFC2988 as a first step. Further analysis and experimentation showed that the Linux TCP RTO estimator is superior to a basic RFC2988 implementation. A summary is on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ccid2/rto_estimator/ In addition, this estimator fared well in a recent empirical evaluation: Rewaskar, Sushant, Jasleen Kaur and F. Donelson Smith. A Performance Study of Loss Detection/Recovery in Real-world TCP Implementations. Proceedings of 15th IEEE International Conference on Network Protocols (ICNP-07). 2007. Thus there is significant benefit in reusing the existing TCP code. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid2.c | 171 ++++++++++++++++++++++------------------- net/dccp/ccids/ccid2.h | 18 +++-- 2 files changed, 107 insertions(+), 82 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index fa074d442065..22753fd98698 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -110,12 +110,6 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } -static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hctx, long val) -{ - ccid2_pr_debug("change SRTT to %ld\n", val); - hctx->srtt = val; -} - static void ccid2_start_rto_timer(struct sock *sk); static void ccid2_hc_tx_rto_expire(unsigned long data) @@ -123,7 +117,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); const bool sender_was_blocked = ccid2_cwnd_network_limited(hctx); - long s; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -135,10 +128,8 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) /* back-off timer */ hctx->rto <<= 1; - - s = hctx->rto / HZ; - if (s > 60) - hctx->rto = 60 * HZ; + if (hctx->rto > DCCP_RTO_MAX) + hctx->rto = DCCP_RTO_MAX; /* adjust pipe, cwnd etc */ hctx->ssthresh = hctx->cwnd / 2; @@ -279,9 +270,87 @@ static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) ccid2_pr_debug("deleted RTO timer\n"); } -static inline void ccid2_new_ack(struct sock *sk, - struct ccid2_seq *seqp, - unsigned int *maxincr) +/** + * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm + * This code is almost identical with TCP's tcp_rtt_estimator(), since + * - it has a higher sampling frequency (recommended by RFC 1323), + * - the RTO does not collapse into RTT due to RTTVAR going towards zero, + * - it is simple (cf. more complex proposals such as Eifel timer or research + * which suggests that the gain should be set according to window size), + * - in tests it was found to work well with CCID2 [gerrit]. + */ +static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) +{ + struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); + long m = mrtt ? : 1; + + if (hctx->srtt == 0) { + /* First measurement m */ + hctx->srtt = m << 3; + hctx->mdev = m << 1; + + hctx->mdev_max = max(TCP_RTO_MIN, hctx->mdev); + hctx->rttvar = hctx->mdev_max; + hctx->rtt_seq = dccp_sk(sk)->dccps_gss; + } else { + /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ + m -= (hctx->srtt >> 3); + hctx->srtt += m; + + /* Similarly, update scaled mdev with regard to |m| */ + if (m < 0) { + m = -m; + m -= (hctx->mdev >> 2); + /* + * This neutralises RTO increase when RTT < SRTT - mdev + * (see P. Sarolahti, A. Kuznetsov,"Congestion Control + * in Linux TCP", USENIX 2002, pp. 49-62). + */ + if (m > 0) + m >>= 3; + } else { + m -= (hctx->mdev >> 2); + } + hctx->mdev += m; + + if (hctx->mdev > hctx->mdev_max) { + hctx->mdev_max = hctx->mdev; + if (hctx->mdev_max > hctx->rttvar) + hctx->rttvar = hctx->mdev_max; + } + + /* + * Decay RTTVAR at most once per flight, exploiting that + * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) + * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) + * GAR is a useful bound for FlightSize = pipe, AWL is probably + * too low as it over-estimates pipe. + */ + if (after48(dccp_sk(sk)->dccps_gar, hctx->rtt_seq)) { + if (hctx->mdev_max < hctx->rttvar) + hctx->rttvar -= (hctx->rttvar - + hctx->mdev_max) >> 2; + hctx->rtt_seq = dccp_sk(sk)->dccps_gss; + hctx->mdev_max = TCP_RTO_MIN; + } + } + + /* + * Set RTO from SRTT and RTTVAR + * Clock granularity is ignored since the minimum error for RTTVAR is + * clamped to 50msec (corresponding to HZ=20). This leads to a minimum + * RTO of 200msec. This agrees with TCP and RFC 4341, 5.: "Because DCCP + * does not retransmit data, DCCP does not require TCP's recommended + * minimum timeout of one second". + */ + hctx->rto = (hctx->srtt >> 3) + hctx->rttvar; + + if (hctx->rto > DCCP_RTO_MAX) + hctx->rto = DCCP_RTO_MAX; +} + +static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, + unsigned int *maxincr) { struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); @@ -295,64 +364,15 @@ static inline void ccid2_new_ack(struct sock *sk, hctx->cwnd += 1; hctx->packets_acked = 0; } - - /* update RTO */ - if (hctx->srtt == -1 || - time_after(jiffies, hctx->lastrtt + hctx->srtt)) { - unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; - int s; - - /* first measurement */ - if (hctx->srtt == -1) { - ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", - r, jiffies, - (unsigned long long)seqp->ccid2s_seq); - ccid2_change_srtt(hctx, r); - hctx->rttvar = r >> 1; - } else { - /* RTTVAR */ - long tmp = hctx->srtt - r; - long srtt; - - if (tmp < 0) - tmp *= -1; - - tmp >>= 2; - hctx->rttvar *= 3; - hctx->rttvar >>= 2; - hctx->rttvar += tmp; - - /* SRTT */ - srtt = hctx->srtt; - srtt *= 7; - srtt >>= 3; - tmp = r >> 3; - srtt += tmp; - ccid2_change_srtt(hctx, srtt); - } - s = hctx->rttvar << 2; - /* clock granularity is 1 when based on jiffies */ - if (!s) - s = 1; - hctx->rto = hctx->srtt + s; - - /* must be at least a second */ - s = hctx->rto / HZ; - /* DCCP doesn't require this [but I like it cuz my code sux] */ -#if 1 - if (s < 1) - hctx->rto = HZ; -#endif - /* max 60 seconds */ - if (s > 60) - hctx->rto = HZ * 60; - - hctx->lastrtt = jiffies; - - ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", - hctx->srtt, hctx->rttvar, - hctx->rto, HZ, r); - } + /* + * FIXME: RTT is sampled several times per acknowledgment (for each + * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). + * This causes the RTT to be over-estimated, since the older entries + * in the Ack Vector have earlier sending times. + * The cleanest solution is to not use the ccid2s_sent field at all + * and instead use DCCP timestamps - need to be resolved at some time. + */ + ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) @@ -579,8 +599,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (hctx->pipe == 0) sk_stop_timer(sk, &hctx->rtotimer); else - sk_reset_timer(sk, &hctx->rtotimer, - jiffies + hctx->rto); + sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); done: /* check if incoming Acks allow pending packets to be sent */ if (sender_was_blocked && !ccid2_cwnd_network_limited(hctx)) @@ -613,9 +632,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) if (ccid2_hc_tx_alloc_seq(hctx)) return -ENOMEM; - hctx->rto = 3 * HZ; - ccid2_change_srtt(hctx, -1); - hctx->rttvar = -1; + hctx->rto = DCCP_TIMEOUT_INIT; hctx->rpdupack = -1; hctx->last_cong = jiffies; setup_timer(&hctx->rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index d5900dd5d4f4..8b7a2dee2f6d 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -44,7 +44,12 @@ struct ccid2_seq { * * @{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 * @packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @lastrtt: time RTT was last measured + * @srtt: smoothed RTT estimate, scaled by 2^3 + * @mdev: smoothed RTT variation, scaled by 2^2 + * @mdev_max: maximum of @mdev during one flight + * @rttvar: moving average/maximum of @mdev_max + * @rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) + * @rtt_seq: to decay RTTVAR at most once per flight * @rpseq: last consecutive seqno * @rpdupack: dupacks since rpseq * @av_chunks: list of Ack Vectors received on current skb @@ -58,10 +63,13 @@ struct ccid2_hc_tx_sock { int seqbufc; struct ccid2_seq *seqh; struct ccid2_seq *seqt; - long rto; - long srtt; - long rttvar; - unsigned long lastrtt; + /* RTT measurement: variables/principles are the same as in TCP */ + u32 srtt, + mdev, + mdev_max, + rttvar, + rto; + u64 rtt_seq:48; struct timer_list rtotimer; u64 rpseq; int rpdupack; From 20bbd0f75ee4b72c1dafc8e5fb6ad39ba506a75c Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 082/105] dccp ccid-2: Remove wrappers around sk_{reset,stop}_timer() This removes the wrappers around the sk timer functions as it makes the code clearer and not much is gained from using wrappers: the BUG_ON in start_rto_timer will never trigger since that function was called only when * the RTO timer expired (rto_expire, and then timer_pending() is false); * in tx_packet_sent only if !timer_pending() (BUG_ON is redundant here); * previously in new_ack, after stopping the timer (timer_pending() false). One further motive behind this patch is to replace the RTO timer with the icsk retransmission timer, as it is already part of the DCCP socket. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid2.c | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 22753fd98698..c539f79ab8e8 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -110,8 +110,6 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } -static void ccid2_start_rto_timer(struct sock *sk); - static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; @@ -150,23 +148,13 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) /* if we were blocked before, we may now send cwnd=1 packet */ if (sender_was_blocked) tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet); - ccid2_start_rto_timer(sk); + /* restart backed-off timer */ + sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); out: bh_unlock_sock(sk); sock_put(sk); } -static void ccid2_start_rto_timer(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - - ccid2_pr_debug("setting RTO timeout=%ld\n", hctx->rto); - - BUG_ON(timer_pending(&hctx->rtotimer)); - sk_reset_timer(sk, &hctx->rtotimer, - jiffies + hctx->rto); -} - static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); @@ -245,7 +233,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) /* setup RTO timer */ if (!timer_pending(&hctx->rtotimer)) - ccid2_start_rto_timer(sk); + sk_reset_timer(sk, &hctx->rtotimer, jiffies + hctx->rto); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { @@ -262,14 +250,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) #endif } -static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); - - sk_stop_timer(sk, &hctx->rtotimer); - ccid2_pr_debug("deleted RTO timer\n"); -} - /** * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm * This code is almost identical with TCP's tcp_rtt_estimator(), since @@ -645,7 +625,7 @@ static void ccid2_hc_tx_exit(struct sock *sk) struct ccid2_hc_tx_sock *hctx = ccid2_hc_tx_sk(sk); int i; - ccid2_hc_tx_kill_rto_timer(sk); + sk_stop_timer(sk, &hctx->rtotimer); for (i = 0; i < hctx->seqbufc; i++) kfree(hctx->seqbuf[i]); From b25b0c60b0c39a82bc651aeb6443bcb36cd17f76 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 083/105] dccp: Combine the functionality of enqeueing and cloning Realising the following call pattern, * first dccp_entail() is called to enqueue a new skb and * then skb_clone() is called to transmit a clone of that skb, this patch integrates both interrelated steps into dccp_entail(). Note: the return value of skb_clone is not checked. It may be an idea to add a warning if this occurs. In both instances, however, a timer is set for retransmission, so that cloning is re-tried via dccp_retransmit_skb(). Signed-off-by: Gerrit Renker --- net/dccp/output.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/dccp/output.c b/net/dccp/output.c index 39056dc61355..b1eaf7bcfb11 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -26,11 +26,13 @@ static inline void dccp_event_ack_sent(struct sock *sk) inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } -static void dccp_skb_entail(struct sock *sk, struct sk_buff *skb) +/* enqueue @skb on sk_send_head for retransmission, return clone to send now */ +static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb) { skb_set_owner_w(skb, sk); WARN_ON(sk->sk_send_head); sk->sk_send_head = skb; + return skb_clone(sk->sk_send_head, gfp_any()); } /* @@ -550,8 +552,7 @@ int dccp_connect(struct sock *sk) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST; - dccp_skb_entail(sk, skb); - dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL)); + dccp_transmit_skb(sk, dccp_skb_entail(sk, skb)); DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); /* Timer for repeating the REQUEST until an answer. */ @@ -676,8 +677,7 @@ void dccp_send_close(struct sock *sk, const int active) DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE; if (active) { - dccp_skb_entail(sk, skb); - dccp_transmit_skb(sk, skb_clone(skb, prio)); + skb = dccp_skb_entail(sk, skb); /* * Retransmission timer for active-close: RFC 4340, 8.3 requires * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ @@ -690,6 +690,6 @@ void dccp_send_close(struct sock *sk, const int active) */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, DCCP_TIMEOUT_INIT, DCCP_RTO_MAX); - } else - dccp_transmit_skb(sk, skb); + } + dccp_transmit_skb(sk, skb); } From 6224877b2ca4be5de96270a8ae490fe2ba11b0e0 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 084/105] tcp/dccp: Consolidate common code for RFC 3390 conversion This patch consolidates the code common to TCP and CCID-2: * TCP uses RFC 3390 in a packet-oriented manner (tcp_input.c) and * CCID-2 uses RFC 3390 in packet-oriented manner (RFC 4341). Signed-off-by: Gerrit Renker --- include/net/tcp.h | 15 +++++++++++++++ net/dccp/ccids/ccid2.c | 8 ++------ net/ipv4/tcp_input.c | 17 ++--------------- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 8983386356a5..6bc4b8148ca0 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -782,6 +782,21 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) +/* + * Convert RFC3390 larger initial windows into an equivalent number of packets. + * + * John Heffner states: + * + * The RFC specifies a window of no more than 4380 bytes + * unless 2*MSS > 4380. Reading the pseudocode in the RFC + * is a bit misleading because they use a clamp at 4380 bytes + * rather than a multiplier in the relevant range. + */ +static inline u32 rfc3390_bytes_to_packets(const u32 bytes) +{ + return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3); +} + extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index c539f79ab8e8..fa713227c66f 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -596,12 +596,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ hctx->ssthresh = ~0U; - /* - * RFC 4341, 5: "The cwnd parameter is initialized to at most four - * packets for new connections, following the rules from [RFC3390]". - * We need to convert the bytes of RFC3390 into the packets of RFC 4341. - */ - hctx->cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); + /* Use larger initial windows (RFC 3390, rfc2581bis) */ + hctx->cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); /* Make sure that Ack Ratio is enabled and within bounds. */ max_ratio = DIV_ROUND_UP(hctx->cwnd, 2); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 67ccce2a96bd..16d0040de34d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -811,25 +811,12 @@ void tcp_update_metrics(struct sock *sk) } } -/* Numbers are taken from RFC3390. - * - * John Heffner states: - * - * The RFC specifies a window of no more than 4380 bytes - * unless 2*MSS > 4380. Reading the pseudocode in the RFC - * is a bit misleading because they use a clamp at 4380 bytes - * rather than use a multiplier in the relevant range. - */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); - if (!cwnd) { - if (tp->mss_cache > 1460) - cwnd = 2; - else - cwnd = (tp->mss_cache > 1095) ? 3 : 4; - } + if (!cwnd) + cwnd = rfc3390_bytes_to_packets(tp->mss_cache); return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } From ddab05568eaa70fc92b2aae957136f188f724e9c Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 085/105] dccp: Clean up slow-path input processing This patch rearranges the order of statements of the slow-path input processing (i.e. any other state than OPEN), to resolve the following issues. 1. Dependencies: the order of statements now better matches RFC 4340, 8.5, i.e. step 7 is before step 9 (previously 9 was before 7), and parsing options in step 8 (which can consume resources) now comes after step 7. 2. Bug-fix: in state CLOSED, there should not be any sequence number checking or option processing. This is why the test for CLOSED has been moved after the test for LISTEN. 3. As before sequence number checks are omitted if in state LISTEN/REQUEST, due to the note underneath the table in RFC 4340, 7.5.3. 4. Packets are now passed on to Ack Vector / CCID processing only after - step 7 (receive unexpected packets), - step 9 (receive Reset), - step 13 (receive CloseReq), - step 14 (receive Close) and only if the state is PARTOPEN. This simplifies CCID processing: - in LISTEN/CLOSED the CCIDs are non-existent; - in RESPOND/REQUEST the CCIDs have not yet been negotiated; - in CLOSEREQ and active-CLOSING the node has already closed this socket; - in passive-CLOSING the client is waiting for its Reset. In the last case, RFC 4340, 8.3 leaves it open to ignore further incoming data, which is the approach taken here. As a result of (3), CCID processing is now indeed confined to OPEN/PARTOPEN states, i.e. congestion control is performed only on the flow of data packets. This avoids pathological cases of doing congestion control on those messages which set up and terminate the connection. I have done a few checks to see if this creates a problem in other parts of the code. This seems not to be the case; even if there were one, it would be better to fix it than to perform congestion control on Close/Request/Response messages. Similarly for Ack Vectors (as they depend on the negotiated CCID). Signed-off-by: Gerrit Renker --- net/dccp/input.c | 68 +++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/net/dccp/input.c b/net/dccp/input.c index 9a108ce17fc7..b1e38bf94456 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -603,22 +603,36 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Caller (dccp_v4_do_rcv) will send Reset */ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; return 1; + } else if (sk->sk_state == DCCP_CLOSED) { + dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; + return 1; } - if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) { - if (dccp_check_seqno(sk, skb)) - goto discard; + /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */ + if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb)) + goto discard; - /* - * Step 8: Process options and mark acknowledgeable - */ - if (dccp_parse_options(sk, NULL, skb)) - return 1; - - dccp_handle_ackvec_processing(sk, skb); - dccp_deliver_input_to_ccids(sk, skb); + /* + * Step 7: Check for unexpected packet types + * If (S.is_server and P.type == Response) + * or (S.is_client and P.type == Request) + * or (S.state == RESPOND and P.type == Data), + * Send Sync packet acknowledging P.seqno + * Drop packet and return + */ + if ((dp->dccps_role != DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_RESPONSE) || + (dp->dccps_role == DCCP_ROLE_CLIENT && + dh->dccph_type == DCCP_PKT_REQUEST) || + (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) { + dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); + goto discard; } + /* Step 8: Process options */ + if (dccp_parse_options(sk, NULL, skb)) + return 1; + /* * Step 9: Process Reset * If P.type == Reset, @@ -626,41 +640,21 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * S.state := TIMEWAIT * Set TIMEWAIT timer * Drop packet and return - */ + */ if (dh->dccph_type == DCCP_PKT_RESET) { dccp_rcv_reset(sk, skb); return 0; - /* - * Step 7: Check for unexpected packet types - * If (S.is_server and P.type == Response) - * or (S.is_client and P.type == Request) - * or (S.state == RESPOND and P.type == Data), - * Send Sync packet acknowledging P.seqno - * Drop packet and return - */ - } else if ((dp->dccps_role != DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_RESPONSE) || - (dp->dccps_role == DCCP_ROLE_CLIENT && - dh->dccph_type == DCCP_PKT_REQUEST) || - (sk->sk_state == DCCP_RESPOND && - dh->dccph_type == DCCP_PKT_DATA)) { - dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC); - goto discard; - } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { + } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */ if (dccp_rcv_closereq(sk, skb)) return 0; goto discard; - } else if (dh->dccph_type == DCCP_PKT_CLOSE) { + } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */ if (dccp_rcv_close(sk, skb)) return 0; goto discard; } switch (sk->sk_state) { - case DCCP_CLOSED: - dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; - return 1; - case DCCP_REQUESTING: queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); if (queued >= 0) @@ -669,8 +663,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, __kfree_skb(skb); return 0; - case DCCP_RESPOND: case DCCP_PARTOPEN: + /* Step 8: if using Ack Vectors, mark packet acknowledgeable */ + dccp_handle_ackvec_processing(sk, skb); + dccp_deliver_input_to_ccids(sk, skb); + /* fall through */ + case DCCP_RESPOND: queued = dccp_rcv_respond_partopen_state_process(sk, skb, dh, len); break; From d6da3511d6b558d0b017777b61dc08b8fbc06ea4 Mon Sep 17 00:00:00 2001 From: Tomasz Grobelny Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 086/105] dccp: Policy-based packet dequeueing infrastructure This patch adds a generic infrastructure for policy-based dequeueing of TX packets and provides two policies: * a simple FIFO policy (which is the default) and * a priority based policy (set via socket options). Both policies honour the tx_qlen sysctl for the maximum size of the write queue (can be overridden via socket options). The priority policy uses skb->priority internally to assign an u32 priority identifier, using the same ranking as SO_PRIORITY. The skb->priority field is set to 0 when the packet leaves DCCP. The priority is supplied as ancillary data using cmsg(3), the patch also provides the requisite parsing routines. Signed-off-by: Tomasz Grobelny Signed-off-by: Gerrit Renker --- Documentation/networking/dccp.txt | 19 +++++ include/linux/dccp.h | 21 +++++ net/dccp/Makefile | 2 +- net/dccp/dccp.h | 12 +++ net/dccp/output.c | 7 +- net/dccp/proto.c | 67 +++++++++++++++- net/dccp/qpolicy.c | 126 ++++++++++++++++++++++++++++++ 7 files changed, 246 insertions(+), 8 deletions(-) create mode 100644 net/dccp/qpolicy.c diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index b132e4a3cf0f..fcfc12534428 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -45,6 +45,25 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree Socket options ============== +DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes +a policy ID as argument and can only be set before the connection (i.e. changes +during an established connection are not supported). Currently, two policies are +defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, +and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an +u32 priority value as ancillary data to sendmsg(), where higher numbers indicate +a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to +be formatted using a cmsg(3) message header filled in as follows: + cmsg->cmsg_level = SOL_DCCP; + cmsg->cmsg_type = DCCP_SCM_PRIORITY; + cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ + +DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero +value is always interpreted as unbounded queue length. If different from zero, +the interpretation of this parameter depends on the current dequeuing policy +(see above): the "simple" policy will enforce a fixed queue size by returning +EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the +lowest-priority packet first. The default value for this parameter is +initialised from /proc/sys/net/dccp/default/tx_qlen. DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, diff --git a/include/linux/dccp.h b/include/linux/dccp.h index eed52bcd35d0..010e2d87ed75 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -197,6 +197,21 @@ enum dccp_feature_numbers { DCCPF_MAX_CCID_SPECIFIC = 255, }; +/* DCCP socket control message types for cmsg */ +enum dccp_cmsg_type { + DCCP_SCM_PRIORITY = 1, + DCCP_SCM_QPOLICY_MAX = 0xFFFF, + /* ^-- Up to here reserved exclusively for qpolicy parameters */ + DCCP_SCM_MAX +}; + +/* DCCP priorities for outgoing/queued packets */ +enum dccp_packet_dequeueing_policy { + DCCPQ_POLICY_SIMPLE, + DCCPQ_POLICY_PRIO, + DCCPQ_POLICY_MAX +}; + /* DCCP socket options */ #define DCCP_SOCKOPT_PACKET_SIZE 1 /* XXX deprecated, without effect */ #define DCCP_SOCKOPT_SERVICE 2 @@ -210,6 +225,8 @@ enum dccp_feature_numbers { #define DCCP_SOCKOPT_CCID 13 #define DCCP_SOCKOPT_TX_CCID 14 #define DCCP_SOCKOPT_RX_CCID 15 +#define DCCP_SOCKOPT_QPOLICY_ID 16 +#define DCCP_SOCKOPT_QPOLICY_TXQLEN 17 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 @@ -458,6 +475,8 @@ struct dccp_ackvec; * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) * @dccps_options_received - parsed set of retrieved options + * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy + * @dccps_tx_qlen - maximum length of the TX queue * @dccps_role - role of this sock, one of %dccp_role * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending @@ -500,6 +519,8 @@ struct dccp_sock { struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; struct dccp_options_received dccps_options_received; + __u8 dccps_qpolicy; + __u32 dccps_tx_qlen; enum dccp_role dccps_role:2; __u8 dccps_hc_rx_insert_options:1; __u8 dccps_hc_tx_insert_options:1; diff --git a/net/dccp/Makefile b/net/dccp/Makefile index b68440bd7fa2..0c1c9af2bf7e 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o dccp-y := ccid.o feat.o input.o minisocks.o options.o \ - output.o proto.o timer.o ackvec.o + qpolicy.o output.o proto.o timer.o ackvec.o dccp_ipv4-y := ipv4.o diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 74c90cd27677..ce2dd6f6f34d 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -234,6 +234,18 @@ extern void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, extern void dccp_send_sync(struct sock *sk, const u64 seq, const enum dccp_pkt_type pkt_type); +/* + * TX Packet Dequeueing Interface + */ +extern void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb); +extern bool dccp_qpolicy_full(struct sock *sk); +extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); +extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); +extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); + +/* + * TX Packet Output and TX Timers + */ extern void dccp_write_xmit(struct sock *sk); extern void dccp_write_space(struct sock *sk); extern void dccp_flush_write_queue(struct sock *sk, long *time_budget); diff --git a/net/dccp/output.c b/net/dccp/output.c index b1eaf7bcfb11..2532797a8009 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -241,7 +241,7 @@ static void dccp_xmit_packet(struct sock *sk) { int err, len; struct dccp_sock *dp = dccp_sk(sk); - struct sk_buff *skb = skb_dequeue(&sk->sk_write_queue); + struct sk_buff *skb = dccp_qpolicy_pop(sk); if (unlikely(skb == NULL)) return; @@ -344,7 +344,7 @@ void dccp_write_xmit(struct sock *sk) struct dccp_sock *dp = dccp_sk(sk); struct sk_buff *skb; - while ((skb = skb_peek(&sk->sk_write_queue))) { + while ((skb = dccp_qpolicy_top(sk))) { int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb); switch (ccid_packet_dequeue_eval(rc)) { @@ -358,8 +358,7 @@ void dccp_write_xmit(struct sock *sk) dccp_xmit_packet(sk); break; case CCID_PACKET_ERR: - skb_dequeue(&sk->sk_write_queue); - kfree_skb(skb); + dccp_qpolicy_drop(sk, skb); dccp_pr_debug("packet discarded due to err=%d\n", rc); } } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 8c125ffab1c5..b56efdd2a421 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -189,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; + dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; dccp_init_xmit_timers(sk); @@ -541,6 +542,20 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_RECV_CSCOV: err = dccp_setsockopt_cscov(sk, val, true); break; + case DCCP_SOCKOPT_QPOLICY_ID: + if (sk->sk_state != DCCP_CLOSED) + err = -EISCONN; + else if (val < 0 || val >= DCCPQ_POLICY_MAX) + err = -EINVAL; + else + dp->dccps_qpolicy = val; + break; + case DCCP_SOCKOPT_QPOLICY_TXQLEN: + if (val < 0) + err = -EINVAL; + else + dp->dccps_tx_qlen = val; + break; default: err = -ENOPROTOOPT; break; @@ -648,6 +663,12 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_RECV_CSCOV: val = dp->dccps_pcrlen; break; + case DCCP_SOCKOPT_QPOLICY_ID: + val = dp->dccps_qpolicy; + break; + case DCCP_SOCKOPT_QPOLICY_TXQLEN: + val = dp->dccps_tx_qlen; + break; case 128 ... 191: return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname, len, (u32 __user *)optval, optlen); @@ -690,6 +711,43 @@ int compat_dccp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(compat_dccp_getsockopt); #endif +static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) +{ + struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg); + + /* + * Assign an (opaque) qpolicy priority value to skb->priority. + * + * We are overloading this skb field for use with the qpolicy subystem. + * The skb->priority is normally used for the SO_PRIORITY option, which + * is initialised from sk_priority. Since the assignment of sk_priority + * to skb->priority happens later (on layer 3), we overload this field + * for use with queueing priorities as long as the skb is on layer 4. + * The default priority value (if nothing is set) is 0. + */ + skb->priority = 0; + + for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) { + + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_DCCP) + continue; + + switch (cmsg->cmsg_type) { + case DCCP_SCM_PRIORITY: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) + return -EINVAL; + skb->priority = *(__u32 *)CMSG_DATA(cmsg); + break; + default: + return -EINVAL; + } + } + return 0; +} + int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -705,8 +763,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, lock_sock(sk); - if (sysctl_dccp_tx_qlen && - (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) { + if (dccp_qpolicy_full(sk)) { rc = -EAGAIN; goto out_release; } @@ -734,7 +791,11 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (rc != 0) goto out_discard; - skb_queue_tail(&sk->sk_write_queue, skb); + rc = dccp_msghdr_parse(msg, skb); + if (rc != 0) + goto out_discard; + + dccp_qpolicy_push(sk, skb); dccp_write_xmit(sk); out_release: release_sock(sk); diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c new file mode 100644 index 000000000000..414696b0d830 --- /dev/null +++ b/net/dccp/qpolicy.c @@ -0,0 +1,126 @@ +/* + * net/dccp/qpolicy.c + * + * Policy-based packet dequeueing interface for DCCP. + * + * Copyright (c) 2008 Tomasz Grobelny + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License v2 + * as published by the Free Software Foundation. + */ +#include "dccp.h" + +/* + * Simple Dequeueing Policy: + * If tx_qlen is different from 0, enqueue up to tx_qlen elements. + */ +static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb) +{ + skb_queue_tail(&sk->sk_write_queue, skb); +} + +static bool qpolicy_simple_full(struct sock *sk) +{ + return dccp_sk(sk)->dccps_tx_qlen && + sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen; +} + +static struct sk_buff *qpolicy_simple_top(struct sock *sk) +{ + return skb_peek(&sk->sk_write_queue); +} + +/* + * Priority-based Dequeueing Policy: + * If tx_qlen is different from 0 and the queue has reached its upper bound + * of tx_qlen elements, replace older packets lowest-priority-first. + */ +static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk) +{ + struct sk_buff *skb, *best = NULL; + + skb_queue_walk(&sk->sk_write_queue, skb) + if (best == NULL || skb->priority > best->priority) + best = skb; + return best; +} + +static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk) +{ + struct sk_buff *skb, *worst = NULL; + + skb_queue_walk(&sk->sk_write_queue, skb) + if (worst == NULL || skb->priority < worst->priority) + worst = skb; + return worst; +} + +static bool qpolicy_prio_full(struct sock *sk) +{ + if (qpolicy_simple_full(sk)) + dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk)); + return false; +} + +/** + * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface + * @push: add a new @skb to the write queue + * @full: indicates that no more packets will be admitted + * @top: peeks at whatever the queueing policy defines as its `top' + */ +static struct dccp_qpolicy_operations { + void (*push) (struct sock *sk, struct sk_buff *skb); + bool (*full) (struct sock *sk); + struct sk_buff* (*top) (struct sock *sk); + +} qpol_table[DCCPQ_POLICY_MAX] = { + [DCCPQ_POLICY_SIMPLE] = { + .push = qpolicy_simple_push, + .full = qpolicy_simple_full, + .top = qpolicy_simple_top, + }, + [DCCPQ_POLICY_PRIO] = { + .push = qpolicy_simple_push, + .full = qpolicy_prio_full, + .top = qpolicy_prio_best_skb, + }, +}; + +/* + * Externally visible interface + */ +void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb) +{ + qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb); +} + +bool dccp_qpolicy_full(struct sock *sk) +{ + return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk); +} + +void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb) +{ + if (skb != NULL) { + skb_unlink(skb, &sk->sk_write_queue); + kfree_skb(skb); + } +} + +struct sk_buff *dccp_qpolicy_top(struct sock *sk) +{ + return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk); +} + +struct sk_buff *dccp_qpolicy_pop(struct sock *sk) +{ + struct sk_buff *skb = dccp_qpolicy_top(sk); + + /* Clear any skb fields that we used internally */ + skb->priority = 0; + + if (skb) + skb_unlink(skb, &sk->sk_write_queue); + return skb; +} From 7d1af6a8d935678248d057564e75e1452409a53c Mon Sep 17 00:00:00 2001 From: Tomasz Grobelny Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 087/105] dccp qpolicy: Parameter checking of cmsg qpolicy parameters Ensure that cmsg->cmsg_type value is valid for qpolicy that is currently in use. Signed-off-by: Tomasz Grobelny Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 1 + net/dccp/proto.c | 4 ++++ net/dccp/qpolicy.c | 23 +++++++++++++++++------ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index ce2dd6f6f34d..1585fa26488c 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -242,6 +242,7 @@ extern bool dccp_qpolicy_full(struct sock *sk); extern void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb); extern struct sk_buff *dccp_qpolicy_top(struct sock *sk); extern struct sk_buff *dccp_qpolicy_pop(struct sock *sk); +extern bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param); /* * TX Packet Output and TX Timers diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b56efdd2a421..a3caa11aa836 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -735,6 +735,10 @@ static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) if (cmsg->cmsg_level != SOL_DCCP) continue; + if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX && + !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type)) + return -EINVAL; + switch (cmsg->cmsg_type) { case DCCP_SCM_PRIORITY: if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32))) diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c index 414696b0d830..27383f88c75f 100644 --- a/net/dccp/qpolicy.c +++ b/net/dccp/qpolicy.c @@ -73,17 +73,20 @@ static struct dccp_qpolicy_operations { void (*push) (struct sock *sk, struct sk_buff *skb); bool (*full) (struct sock *sk); struct sk_buff* (*top) (struct sock *sk); + __be32 params; } qpol_table[DCCPQ_POLICY_MAX] = { [DCCPQ_POLICY_SIMPLE] = { - .push = qpolicy_simple_push, - .full = qpolicy_simple_full, - .top = qpolicy_simple_top, + .push = qpolicy_simple_push, + .full = qpolicy_simple_full, + .top = qpolicy_simple_top, + .params = 0, }, [DCCPQ_POLICY_PRIO] = { - .push = qpolicy_simple_push, - .full = qpolicy_prio_full, - .top = qpolicy_prio_best_skb, + .push = qpolicy_simple_push, + .full = qpolicy_prio_full, + .top = qpolicy_prio_best_skb, + .params = DCCP_SCM_PRIORITY, }, }; @@ -124,3 +127,11 @@ struct sk_buff *dccp_qpolicy_pop(struct sock *sk) skb_unlink(skb, &sk->sk_write_queue); return skb; } + +bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param) +{ + /* check if exactly one bit is set */ + if (!param || (param & (param - 1))) + return false; + return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param; +} From f76fd327a8b32d3ad5b51639faf6f54d18be0981 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 088/105] dccp ccid-3: Runtime verification of timer resolution The DCCP base time resolution is 10 microseconds (RFC 4340, 13.1 ... 13.3). Using a timer with a lower resolution was found to trigger the following bug warnings/problems on high-speed networks (e.g. local loopback): * RTT samples are rounded down to 0 if below resolution; * in some cases, negative RTT samples were observed; * the CCID-3 feedback timer complains that the feedback interval is 0, since the feedback interval is in the order of 1 RTT or less and RTT measurement rounded this down to 0; On an Intel computer this will for instance happen when using a boot-time parameter of "clocksource=jiffies". The following system log messages were observed: 11:24:00 kernel: BUG: delta (0) <= 0 at ccid3_hc_rx_send_feedback() 11:26:12 kernel: BUG: delta (0) <= 0 at ccid3_hc_rx_send_feedback() 11:26:30 kernel: dccp_sample_rtt: unusable RTT sample 0, using min 11:26:30 last message repeated 5 times This patch defines a global constant for the time resolution, adds this in timer.c, and checks the available clock resolution at CCID-3 module load time. When the resolution is worse than 10 microseconds, module loading exits with a message "socket type not supported". Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 13 +++++++++++++ net/dccp/dccp.h | 5 ++++- net/dccp/timer.c | 3 +-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 0d406f82e433..f566eb76aeb2 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -869,6 +869,19 @@ MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); static __init int ccid3_module_init(void) { + struct timespec tp; + + /* + * Without a fine-grained clock resolution, RTTs/X_recv are not sampled + * correctly and feedback is sent either too early or too late. + */ + hrtimer_get_res(CLOCK_MONOTONIC, &tp); + if (tp.tv_sec || tp.tv_nsec > DCCP_TIME_RESOLUTION * NSEC_PER_USEC) { + printk(KERN_ERR "%s: Timer too coarse (%ld usec), need %u-usec" + " resolution - check your clocksource.\n", __func__, + tp.tv_nsec/NSEC_PER_USEC, DCCP_TIME_RESOLUTION); + return -ESOCKTNOSUPPORT; + } return ccid_register(&ccid3); } module_init(ccid3_module_init); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 1585fa26488c..b63a82ccb2b2 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -86,10 +86,13 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); */ #define DCCP_RTO_MAX ((unsigned)(64 * HZ)) +/* DCCP base time resolution - 10 microseconds (RFC 4340, 13.1 ... 13.3) */ +#define DCCP_TIME_RESOLUTION 10 + /* * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4 */ -#define DCCP_SANE_RTT_MIN 100 +#define DCCP_SANE_RTT_MIN (10 * DCCP_TIME_RESOLUTION) #define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5) #define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC) diff --git a/net/dccp/timer.c b/net/dccp/timer.c index e02d5a94f4c0..16359e29e7f5 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -281,8 +281,7 @@ u32 dccp_timestamp(void) { s64 delta = ktime_us_delta(ktime_get_real(), dccp_timestamp_seed); - do_div(delta, 10); - return delta; + return div_u64(delta, DCCP_TIME_RESOLUTION); } EXPORT_SYMBOL_GPL(dccp_timestamp); From d0c05fe4448db5cbdd886186860581f736f59ae9 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 089/105] dccp ccid-3: Simplified handling of TX states Since CCIDs are only used during the established phase of a connection, they have very little internal state; this specifically reduces to: * "no packet sent" if and only if s == 0, for the TX packet size s; * when the first packet has been sent (i.e. `s' > 0), the question is whether or not feedback has been received: - if a feedback packet is received, "feedback = yes" is set, - if the nofeedback timer expires, "feedback = no" is set. Thus the CCID only needs to remember state about whether or not feedback has been received. This is now implemented using a boolean flag, which is toggled when a feedback packet arrives or the nofeedback timer expires. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 41 ++++++----------------------------------- net/dccp/ccids/ccid3.h | 11 ++--------- 2 files changed, 8 insertions(+), 44 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index f566eb76aeb2..5470a978be02 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -49,31 +49,6 @@ static int ccid3_debug; /* * Transmitter Half-Connection Routines */ -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) -{ - static char *ccid3_state_names[] = { - [TFRC_SSTATE_NO_SENT] = "NO_SENT", - [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", - [TFRC_SSTATE_FBACK] = "FBACK", - }; - - return ccid3_state_names[state]; -} -#endif - -static void ccid3_hc_tx_set_state(struct sock *sk, - enum ccid3_hc_tx_states state) -{ - struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - enum ccid3_hc_tx_states oldstate = hctx->state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_tx_state_name(oldstate), - ccid3_tx_state_name(state)); - WARN_ON(state == oldstate); - hctx->state = state; -} /* * Compute the initial sending rate X_init in the manner of RFC 3390: @@ -206,16 +181,15 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) goto restart_timer; } - ccid3_pr_debug("%s(%p, state=%s) - entry \n", dccp_role(sk), sk, - ccid3_tx_state_name(hctx->state)); + ccid3_pr_debug("%s(%p) entry with%s feedback\n", dccp_role(sk), sk, + hctx->feedback ? "" : "out"); /* Ignore and do not restart after leaving the established state */ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) goto out; /* Reset feedback state to "no feedback received" */ - if (hctx->state == TFRC_SSTATE_FBACK) - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); + hctx->feedback = false; /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 @@ -290,7 +264,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) if (unlikely(skb->len == 0)) return -EBADMSG; - if (hctx->state == TFRC_SSTATE_NO_SENT) { + if (hctx->s == 0) { sk_reset_timer(sk, &hctx->no_feedback_timer, (jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); hctx->last_win_count = 0; @@ -324,8 +298,6 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) } ccid3_update_send_interval(hctx); - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - } else { delay = ktime_us_delta(hctx->t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); @@ -396,8 +368,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ - if (hctx->state == TFRC_SSTATE_NO_FBACK) { - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); + if (!hctx->feedback) { + hctx->feedback = true; if (hctx->t_rto == 0) { /* @@ -502,7 +474,6 @@ static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_tx_sock *hctx = ccid_priv(ccid); - hctx->state = TFRC_SSTATE_NO_SENT; hctx->hist = NULL; setup_timer(&hctx->no_feedback_timer, ccid3_hc_tx_no_feedback_timer, (unsigned long)sk); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 7884159937aa..1773a8dd36d8 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -70,13 +70,6 @@ enum ccid3_options { TFRC_OPT_RECEIVE_RATE = 194, }; -/* TFRC sender states */ -enum ccid3_hc_tx_states { - TFRC_SSTATE_NO_SENT = 1, - TFRC_SSTATE_NO_FBACK, - TFRC_SSTATE_FBACK, -}; - /** struct ccid3_hc_tx_sock - CCID3 sender half-connection socket * * @x - Current sending rate in 64 * bytes per second @@ -87,7 +80,7 @@ enum ccid3_hc_tx_states { * @s - Packet size in bytes * @t_rto - Nofeedback Timer setting in usecs * @t_ipi - Interpacket (send) interval (RFC 3448, 4.6) in usecs - * @state - Sender state, one of %ccid3_hc_tx_states + * @feedback - Whether feedback has been received or not * @last_win_count - Last window counter sent * @t_last_win_count - Timestamp of earliest packet with * last_win_count value sent @@ -105,7 +98,7 @@ struct ccid3_hc_tx_sock { u32 t_rto; u32 t_ipi; u16 s; - enum ccid3_hc_tx_states state:8; + bool feedback:1; u8 last_win_count; ktime_t t_last_win_count; struct timer_list no_feedback_timer; From 8b67ad12b04ef7bdf5d2b4de24fe5a609b26cf12 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 090/105] dccp tfrc: Suppress unavoidable "below resolution" warning In the congestion-avoidance phase a decay of p towards 0 is natural once fewer losses are encountered. Hence the warning message "p is below resolution" is not necessary, and thus turned into a debug message by this patch. The TFRC_SMALLEST_P is needed since in theory p never actually reaches 0. When no further losses are encountered, the loss interval I_0 grows in length, causing p to decrease towards 0, causing X_calc = s/(RTT * f(p)) to increase. With the given minimum-resolution this congestion avoidance phase stops at some fixed value, an approximation formula has been added to the documentation. Signed-off-by: Gerrit Renker --- net/dccp/ccids/lib/tfrc_equation.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index bc3dc2b2a849..38239c4d5e14 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -632,8 +632,16 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ - DCCP_WARN("Value of p (%d) below resolution. " - "Substituting %d\n", p, TFRC_SMALLEST_P); + /* + * In the congestion-avoidance phase p decays towards 0 + * when there are no further losses, so this case is + * natural. Truncating to p_min = 0.01% means that the + * maximum achievable throughput is limited to about + * X_calc_max = 122.4 * s/RTT (see RFC 3448, 3.1); e.g. + * with s=1500 bytes, RTT=0.01 s: X_calc_max = 147 Mbps. + */ + tfrc_pr_debug("Value of p (%d) below resolution. " + "Substituting %d\n", p, TFRC_SMALLEST_P); index = 0; } else /* 0.0001 <= p <= 0.05 */ index = p/TFRC_SMALLEST_P - 1; From 24b8d343215919c7a2ba18b9f89a0961e1459cad Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 091/105] dccp tfrc: Receiver history initialisation routine This patch 1) separates history allocation and initialisation, to facilitate early loss detection (implemented by a subsequent patch); 2) removes duplication by using the existing tfrc_rx_hist_purge() if the allocation fails. This is now possible, since the initialisation routine 3) zeroes out the entire history before using it. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 2 +- net/dccp/ccids/lib/packet_history.c | 52 +++++++++++++++++------------ net/dccp/ccids/lib/packet_history.h | 2 +- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 5470a978be02..36f4992f3c38 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -766,7 +766,7 @@ static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) hcrx->state = TFRC_RSTATE_NO_DATA; tfrc_lh_init(&hcrx->li_hist); - return tfrc_rx_hist_alloc(&hcrx->hist); + return tfrc_rx_hist_init(&hcrx->hist, sk); } static void ccid3_hc_rx_exit(struct sock *sk) diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 5c4450866904..5b4e1cf8439d 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -352,28 +352,6 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, } EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); -int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) -{ - int i; - - for (i = 0; i <= TFRC_NDUPACK; i++) { - h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); - if (h->ring[i] == NULL) - goto out_free; - } - - h->loss_count = h->loss_start = 0; - return 0; - -out_free: - while (i-- != 0) { - kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]); - h->ring[i] = NULL; - } - return -ENOBUFS; -} -EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); - void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) { int i; @@ -386,6 +364,36 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); +static int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) +{ + int i; + + memset(h, 0, sizeof(*h)); + + for (i = 0; i <= TFRC_NDUPACK; i++) { + h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC); + if (h->ring[i] == NULL) { + tfrc_rx_hist_purge(h); + return -ENOBUFS; + } + } + return 0; +} + +int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk) +{ + if (tfrc_rx_hist_alloc(h)) + return -ENOBUFS; + /* + * Initialise first entry with GSR to start loss detection as early as + * possible. Code using this must not use any other fields. The entry + * will be overwritten once the CCID updates its received packets. + */ + tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno = dccp_sk(sk)->dccps_gsr; + return 0; +} +EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); + /** * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against */ diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 221d8102da51..e9d8097947d5 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -153,7 +153,7 @@ extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct sock *sk); extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb); -extern int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h); +extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); #endif /* _DCCP_PKT_HIST_ */ From d20ed95f8bf3d98d31dbbab8b00bb4c1a4a140f3 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 092/105] dccp tfrc: Perform early loss detection This enables the TFRC code to begin loss detection (as soon as the module is loaded), using the latest updates from rfc3448bis-06, 6.3.1: * when the first data packet(s) are lost or marked, set * X_target = s/(2*R) => f(p) = s/(R * X_target) = 2, * corresponding to a loss rate of ~ 20.64%. The handle_loss() function is now called right at the begin of rx_packet_recv() and thus no longer protected against duplicates: hence a call to rx_duplicate() has been added. Such a call makes sense now, as the previous patch initialises the first entry with a sequence number of GSR. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 48 +++++++++++++++++++++++------ net/dccp/ccids/lib/packet_history.c | 3 ++ 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 36f4992f3c38..0a7c22598d62 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -577,6 +577,28 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ break; case CCID3_FBACK_PARAM_CHANGE: + if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) { + /* + * rfc3448bis-06, 6.3.1: First packet(s) lost or marked + * FIXME: in rfc3448bis the receiver returns X_recv=0 + * here as it normally would in the first feedback packet. + * However this is not possible yet, since the code still + * uses RFC 3448, i.e. + * If (p > 0) + * Calculate X_calc using the TCP throughput equation. + * X = max(min(X_calc, 2*X_recv), s/t_mbi); + * would bring X down to s/t_mbi. That is why we return + * X_recv according to rfc3448bis-06 for the moment. + */ + u32 rtt = hcrx->rtt ? : DCCP_FALLBACK_RTT, s = hcrx->s; + + if (s == 0) { + DCCP_WARN("No sample for s, using fallback\n"); + s = TCP_MIN_RCVMSS; + } + hcrx->x_recv = scaled_div32(s, 2 * rtt); + break; + } /* * When parameters change (new loss or p > p_prev), we do not * have a reliable estimate for R_m of [RFC 3448, 6.2] and so @@ -650,6 +672,14 @@ static u32 ccid3_first_li(struct sock *sk) u32 x_recv, p, delta; u64 fval; + /* + * rfc3448bis-06, 6.3.1: First data packet(s) are marked or lost. Set p + * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p + * is about 20.64%. This yields an interval length of 4.84 (rounded up). + */ + if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) + return 5; + if (hcrx->rtt == 0) { DCCP_WARN("No RTT estimate available, using fallback RTT\n"); hcrx->rtt = DCCP_FALLBACK_RTT; @@ -683,6 +713,15 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; const bool is_data_packet = dccp_data_packet(skb); + /* + * Perform loss detection and handle pending losses + */ + if (tfrc_rx_handle_loss(&hcrx->hist, &hcrx->li_hist, + skb, ndp, ccid3_first_li, sk)) { + do_feedback = CCID3_FBACK_PARAM_CHANGE; + goto done_receiving; + } + if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) { if (is_data_packet) { const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; @@ -710,15 +749,6 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) hcrx->bytes_recv += payload; } - /* - * Perform loss detection and handle pending losses - */ - if (tfrc_rx_handle_loss(&hcrx->hist, &hcrx->li_hist, - skb, ndp, ccid3_first_li, sk)) { - do_feedback = CCID3_FBACK_PARAM_CHANGE; - goto done_receiving; - } - if (tfrc_rx_hist_loss_pending(&hcrx->hist)) return; /* done receiving */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 5b4e1cf8439d..8db34225c002 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -335,6 +335,9 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, { int is_new_loss = 0; + if (tfrc_rx_hist_duplicate(h, skb)) + return 0; + if (h->loss_count == 0) { __do_track_loss(h, skb, ndp); } else if (h->loss_count == 1) { From 3ca7aea04152255bb65275b0018d3c673bc1f4e7 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 093/105] dccp tfrc: Return type of update_i_mean is void This changes the return type of tfrc_lh_update_i_mean() to void, since that function returns always `false'. This is due to len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ return 0; which means that update_i_mean can only increase the length of the open loss interval I_0, and hence the value of I_tot0 (RFC 3448, 5.4). Consequently the test `i_mean < old_i_mean' at the end of the function always evaluates to false. There is no known way by which a loss interval can suddenly become shorter, therefore the return type of the function is changed to void. (That is, under the given circumstances step (3) in RFC 3448, 6.1 will not occur.) Further changes: ---------------- * the function is now called from tfrc_rx_handle_loss, which is equivalent to the previous way of calling from rx_packet_recv (it was called whenever there was no new or pending loss, now it is also updated when there is a pending loss - this increases the accuracy a bit); * added a FIXME to possibly consider NDP counting as per RFC 4342 (this is not implemented yet). Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 8 -------- net/dccp/ccids/lib/loss_interval.c | 20 +++++++++++--------- net/dccp/ccids/lib/loss_interval.h | 2 +- net/dccp/ccids/lib/packet_history.c | 5 +++++ 4 files changed, 17 insertions(+), 18 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 0a7c22598d62..50dac01e48c5 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -767,15 +767,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) */ if (sample != 0) hcrx->rtt = tfrc_ewma(hcrx->rtt, sample, 9); - - } else if (tfrc_lh_update_i_mean(&hcrx->li_hist, skb)) { - /* - * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean - * has decreased (resp. p has increased), send feedback now. - */ - do_feedback = CCID3_FBACK_PARAM_CHANGE; } - /* * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 */ diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index 5b3ce0688c5c..fe5c2a31c770 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -86,21 +86,26 @@ static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) /** * tfrc_lh_update_i_mean - Update the `open' loss interval I_0 - * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev + * This updates I_mean as the sequence numbers increase. As a consequence, the + * open loss interval I_0 increases, hence p = W_tot/max(I_tot0, I_tot1) + * decreases, and thus there is no need to send renewed feedback. */ -u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) +void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) { struct tfrc_loss_interval *cur = tfrc_lh_peek(lh); - u32 old_i_mean = lh->i_mean; s64 len; if (cur == NULL) /* not initialised */ - return 0; + return; + + /* FIXME: should probably also count non-data packets (RFC 4342, 6.1) */ + if (!dccp_data_packet(skb)) + return; len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1; if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */ - return 0; + return; if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4) /* @@ -114,14 +119,11 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) cur->li_is_closed = 1; if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */ - return 0; + return; cur->li_length = len; tfrc_lh_calc_i_mean(lh); - - return (lh->i_mean < old_i_mean); } -EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean); /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h index 246018a3b269..f101ae2cf528 100644 --- a/net/dccp/ccids/lib/loss_interval.h +++ b/net/dccp/ccids/lib/loss_interval.h @@ -69,7 +69,7 @@ struct tfrc_rx_hist; extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, u32 (*first_li)(struct sock *), struct sock *); -extern u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); +extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); #endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 8db34225c002..8ea96903033d 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -351,6 +351,11 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); __three_after_loss(h); } + + /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */ + if (!is_new_loss) + tfrc_lh_update_i_mean(lh, skb); + return is_new_loss; } EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); From 34a081be8e14b7ada70e069b65b05d54db4af497 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 094/105] dccp tfrc: Let dccp_tfrc_lib do the sampling work This migrates more TFRC-related code into the dccp_tfrc_lib: * sampling of the packet size `s' (which is only needed until the first loss interval is computed (ccid3_first_li)); * updating the byte-counter `bytes_recvd' in between sending feedbacks. The result is a better separation of CCID-3 specific and TFRC specific code, which aids future integration with ECN and e.g. CCID-4. Further changes: ---------------- * replaced magic number of 536 with equivalent constant TCP_MIN_RCVMSS; (this constant is also used when no estimate for `s' is available). Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 38 ++++++----------------------- net/dccp/ccids/ccid3.h | 4 --- net/dccp/ccids/lib/packet_history.c | 10 ++++++++ net/dccp/ccids/lib/packet_history.h | 16 ++++++++++++ net/dccp/proto.c | 2 +- 5 files changed, 35 insertions(+), 35 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 50dac01e48c5..8744590acb34 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -590,12 +590,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, * would bring X down to s/t_mbi. That is why we return * X_recv according to rfc3448bis-06 for the moment. */ - u32 rtt = hcrx->rtt ? : DCCP_FALLBACK_RTT, s = hcrx->s; + u32 rtt = hcrx->rtt ? : DCCP_FALLBACK_RTT, + s = tfrc_rx_hist_packet_size(&hcrx->hist); - if (s == 0) { - DCCP_WARN("No sample for s, using fallback\n"); - s = TCP_MIN_RCVMSS; - } hcrx->x_recv = scaled_div32(s, 2 * rtt); break; } @@ -617,7 +614,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, if (delta <= 0) DCCP_BUG("delta (%ld) <= 0", (long)delta); else - hcrx->x_recv = scaled_div32(hcrx->bytes_recv, delta); + hcrx->x_recv = scaled_div32(hcrx->hist.bytes_recvd, delta); break; default: return; @@ -628,7 +625,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, hcrx->tstamp_last_feedback = now; hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; - hcrx->bytes_recv = 0; + hcrx->hist.bytes_recvd = 0; dp->dccps_hc_rx_insert_options = 1; dccp_send_ack(sk); @@ -669,7 +666,8 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) static u32 ccid3_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - u32 x_recv, p, delta; + u32 x_recv, p, delta, + s = tfrc_rx_hist_packet_size(&hcrx->hist); u64 fval; /* @@ -686,7 +684,7 @@ static u32 ccid3_first_li(struct sock *sk) } delta = ktime_to_us(net_timedelta(hcrx->tstamp_last_feedback)); - x_recv = scaled_div32(hcrx->bytes_recv, delta); + x_recv = scaled_div32(hcrx->hist.bytes_recvd, delta); if (x_recv == 0) { /* would also trigger divide-by-zero */ DCCP_WARN("X_recv==0\n"); if (hcrx->x_recv == 0) { @@ -696,8 +694,7 @@ static u32 ccid3_first_li(struct sock *sk) x_recv = hcrx->x_recv; } - fval = scaled_div(hcrx->s, hcrx->rtt); - fval = scaled_div32(fval, x_recv); + fval = scaled_div32(scaled_div(s, hcrx->rtt), x_recv); p = tfrc_calc_x_reverse_lookup(fval); ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " @@ -724,31 +721,12 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) { if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; do_feedback = CCID3_FBACK_INITIAL; ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); - hcrx->s = payload; - /* - * Not necessary to update bytes_recv here, - * since X_recv = 0 for the first feedback packet (cf. - * RFC 3448, 6.3) -- gerrit - */ } goto update_records; } - if (tfrc_rx_hist_duplicate(&hcrx->hist, skb)) - return; /* done receiving */ - - if (is_data_packet) { - const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; - /* - * Update moving-average of s and the sum of received payload bytes - */ - hcrx->s = tfrc_ewma(hcrx->s, payload, 9); - hcrx->bytes_recv += payload; - } - if (tfrc_rx_hist_loss_pending(&hcrx->hist)) return; /* done receiving */ diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 1773a8dd36d8..0c4fadd85f94 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -124,25 +124,21 @@ enum ccid3_hc_rx_states { * * @last_counter - Tracks window counter (RFC 4342, 8.1) * @state - Receiver state, one of %ccid3_hc_rx_states - * @bytes_recv - Total sum of DCCP payload bytes * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) * @rtt - Receiver estimate of RTT * @tstamp_last_feedback - Time at which last feedback was sent * @hist - Packet history (loss detection + RTT sampling) * @li_hist - Loss Interval database - * @s - Received packet size in bytes * @p_inverse - Inverse of Loss Event Rate (RFC 4342, sec. 8.5) */ struct ccid3_hc_rx_sock { u8 last_counter:4; enum ccid3_hc_rx_states state:8; - u32 bytes_recv; u32 x_recv; u32 rtt; ktime_t tstamp_last_feedback; struct tfrc_rx_hist hist; struct tfrc_loss_hist li_hist; - u16 s; #define p_inverse li_hist.i_mean }; diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 8ea96903033d..ee34b4564242 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -352,6 +352,16 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, __three_after_loss(h); } + /* + * Update moving-average of `s' and the sum of received payload bytes. + */ + if (dccp_data_packet(skb)) { + const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4; + + h->packet_size = tfrc_ewma(h->packet_size, payload, 9); + h->bytes_recvd += payload; + } + /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */ if (!is_new_loss) tfrc_lh_update_i_mean(lh, skb); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index e9d8097947d5..b7c87a1a2720 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -91,12 +91,16 @@ struct tfrc_rx_hist_entry { * @loss_count: Number of entries in circular history * @loss_start: Movable index (for loss detection) * @rtt_sample_prev: Used during RTT sampling, points to candidate entry + * @packet_size: Packet size in bytes (as per RFC 3448, 3.1) + * @bytes_recvd: Number of bytes received since last sending feedback */ struct tfrc_rx_hist { struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; u8 loss_count:2, loss_start:2; #define rtt_sample_prev loss_start + u32 packet_size, + bytes_recvd; }; /** @@ -140,6 +144,18 @@ static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h) return h->loss_count > 0; } +/* + * Accessor functions to retrieve parameters sampled by the RX history + */ +static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h) +{ + if (h->packet_size == 0) { + DCCP_WARN("No sample for s, using fallback\n"); + return TCP_MIN_RCVMSS; + } + return h->packet_size; +} + extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, const u64 ndp); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index a3caa11aa836..ecf3be961e11 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -185,7 +185,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; icsk->icsk_sync_mss = dccp_sync_mss; - dp->dccps_mss_cache = 536; + dp->dccps_mss_cache = TCP_MIN_RCVMSS; dp->dccps_rate_last = jiffies; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT; From 2f3e3bbad917c426d3aba03a535809e5699de156 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 095/105] dccp ccid-3: Remove duplicate RX states The only state information that the CCID-3 receiver keeps is whether initial feedback has been sent or not. Further, this overlaps with use of feedback: * state == TFRC_RSTATE_NO_DATA as long as no feedback has been sent; * state == TFRC_RSTATE_DATA as soon as the first feedback has been sent. This patch reduces the duplication, by memorising the type of the last feedback. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 47 +++++------------------------------------- net/dccp/ccids/ccid3.h | 14 +++++++------ 2 files changed, 13 insertions(+), 48 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 8744590acb34..04b183548aa8 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -528,40 +528,6 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, /* * Receiver Half-Connection Routines */ - -/* CCID3 feedback types */ -enum ccid3_fback_type { - CCID3_FBACK_NONE = 0, - CCID3_FBACK_INITIAL, - CCID3_FBACK_PERIODIC, - CCID3_FBACK_PARAM_CHANGE -}; - -#ifdef CONFIG_IP_DCCP_CCID3_DEBUG -static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) -{ - static char *ccid3_rx_state_names[] = { - [TFRC_RSTATE_NO_DATA] = "NO_DATA", - [TFRC_RSTATE_DATA] = "DATA", - }; - - return ccid3_rx_state_names[state]; -} -#endif - -static void ccid3_hc_rx_set_state(struct sock *sk, - enum ccid3_hc_rx_states state) -{ - struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - enum ccid3_hc_rx_states oldstate = hcrx->state; - - ccid3_pr_debug("%s(%p) %-8.8s -> %s\n", - dccp_role(sk), sk, ccid3_rx_state_name(oldstate), - ccid3_rx_state_name(state)); - WARN_ON(state == oldstate); - hcrx->state = state; -} - static void ccid3_hc_rx_send_feedback(struct sock *sk, const struct sk_buff *skb, enum ccid3_fback_type fbtype) @@ -577,7 +543,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, hcrx->p_inverse = ~0U; /* see RFC 4342, 8.5 */ break; case CCID3_FBACK_PARAM_CHANGE: - if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) { + if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) { /* * rfc3448bis-06, 6.3.1: First packet(s) lost or marked * FIXME: in rfc3448bis the receiver returns X_recv=0 @@ -626,6 +592,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, hcrx->tstamp_last_feedback = now; hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; hcrx->hist.bytes_recvd = 0; + hcrx->feedback = fbtype; dp->dccps_hc_rx_insert_options = 1; dccp_send_ack(sk); @@ -675,7 +642,7 @@ static u32 ccid3_first_li(struct sock *sk) * to give the equivalent of X_target = s/(2*R). Thus fval = 2 and so p * is about 20.64%. This yields an interval length of 4.84 (rounded up). */ - if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) + if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) return 5; if (hcrx->rtt == 0) { @@ -719,11 +686,9 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) goto done_receiving; } - if (unlikely(hcrx->state == TFRC_RSTATE_NO_DATA)) { - if (is_data_packet) { + if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) { + if (is_data_packet) do_feedback = CCID3_FBACK_INITIAL; - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA); - } goto update_records; } @@ -764,7 +729,6 @@ static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid_priv(ccid); - hcrx->state = TFRC_RSTATE_NO_DATA; tfrc_lh_init(&hcrx->li_hist); return tfrc_rx_hist_init(&hcrx->hist, sk); } @@ -779,7 +743,6 @@ static void ccid3_hc_rx_exit(struct sock *sk) static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { - info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rtt; } diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 0c4fadd85f94..72e110a1100f 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -114,16 +114,18 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) return hctx; } -/* TFRC receiver states */ -enum ccid3_hc_rx_states { - TFRC_RSTATE_NO_DATA = 1, - TFRC_RSTATE_DATA, + +enum ccid3_fback_type { + CCID3_FBACK_NONE = 0, + CCID3_FBACK_INITIAL, + CCID3_FBACK_PERIODIC, + CCID3_FBACK_PARAM_CHANGE }; /** struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket * * @last_counter - Tracks window counter (RFC 4342, 8.1) - * @state - Receiver state, one of %ccid3_hc_rx_states + * @feedback - The type of the feedback last sent * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) * @rtt - Receiver estimate of RTT * @tstamp_last_feedback - Time at which last feedback was sent @@ -133,7 +135,7 @@ enum ccid3_hc_rx_states { */ struct ccid3_hc_rx_sock { u8 last_counter:4; - enum ccid3_hc_rx_states state:8; + enum ccid3_fback_type feedback:4; u32 x_recv; u32 rtt; ktime_t tstamp_last_feedback; From 2b81143aa3505e2460b24b357996c2f21840ea58 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 096/105] dccp ccid-3: Always perform receiver RTT sampling This updates the CCID-3 receiver in part with regard to errata 610 and 611 (http://www.rfc-editor.org/errata_list.php), which change RFC 4342 to use the Receive Rate as specified in rfc3448bis, requiring to constantly sample the RTT (or use a sender RTT). Doing this requires reusing the RX history structure after dealing with a loss. The patch does not resolve how to compute X_recv if the interval is less than 1 RTT. A FIXME has been added (and is resolved in subsequent patch). Furthermore, since this is all TFRC-based functionality, the RTT estimation is now also performed by the dccp_tfrc_lib module. This further simplifies the CCID-3 code. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 43 +++++++-------------- net/dccp/ccids/ccid3.h | 2 - net/dccp/ccids/lib/packet_history.c | 60 +++++++++++++++++++++-------- net/dccp/ccids/lib/packet_history.h | 17 +++++++- 4 files changed, 73 insertions(+), 49 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 04b183548aa8..8e64d9665a21 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -556,8 +556,8 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, * would bring X down to s/t_mbi. That is why we return * X_recv according to rfc3448bis-06 for the moment. */ - u32 rtt = hcrx->rtt ? : DCCP_FALLBACK_RTT, - s = tfrc_rx_hist_packet_size(&hcrx->hist); + u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), + rtt = tfrc_rx_hist_rtt(&hcrx->hist); hcrx->x_recv = scaled_div32(s, 2 * rtt); break; @@ -576,6 +576,11 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, break; /* fall through */ case CCID3_FBACK_PERIODIC: + /* + * FIXME: check if delta is less than or equal to 1 RTT using + * the receiver RTT sample. This is described in Errata 610/611 + * of RFC 4342 which reference section 6.2 of RFC 3448. + */ delta = ktime_us_delta(now, hcrx->tstamp_last_feedback); if (delta <= 0) DCCP_BUG("delta (%ld) <= 0", (long)delta); @@ -633,8 +638,8 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) static u32 ccid3_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - u32 x_recv, p, delta, - s = tfrc_rx_hist_packet_size(&hcrx->hist); + u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), + rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p, delta; u64 fval; /* @@ -645,11 +650,6 @@ static u32 ccid3_first_li(struct sock *sk) if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) return 5; - if (hcrx->rtt == 0) { - DCCP_WARN("No RTT estimate available, using fallback RTT\n"); - hcrx->rtt = DCCP_FALLBACK_RTT; - } - delta = ktime_to_us(net_timedelta(hcrx->tstamp_last_feedback)); x_recv = scaled_div32(hcrx->hist.bytes_recvd, delta); if (x_recv == 0) { /* would also trigger divide-by-zero */ @@ -661,7 +661,7 @@ static u32 ccid3_first_li(struct sock *sk) x_recv = hcrx->x_recv; } - fval = scaled_div32(scaled_div(s, hcrx->rtt), x_recv); + fval = scaled_div32(scaled_div(s, rtt), x_recv); p = tfrc_calc_x_reverse_lookup(fval); ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " @@ -695,26 +695,11 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) if (tfrc_rx_hist_loss_pending(&hcrx->hist)) return; /* done receiving */ - /* - * Handle data packets: RTT sampling and monitoring p - */ - if (unlikely(!is_data_packet)) - goto update_records; - - if (!tfrc_lh_is_initialised(&hcrx->li_hist)) { - const u32 sample = tfrc_rx_hist_sample_rtt(&hcrx->hist, skb); - /* - * Empty loss history: no loss so far, hence p stays 0. - * Sample RTT values, since an RTT estimate is required for the - * computation of p when the first loss occurs; RFC 3448, 6.3.1. - */ - if (sample != 0) - hcrx->rtt = tfrc_ewma(hcrx->rtt, sample, 9); - } /* * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 */ - if (SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) + if (is_data_packet && + SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) do_feedback = CCID3_FBACK_PERIODIC; update_records: @@ -744,7 +729,7 @@ static void ccid3_hc_rx_exit(struct sock *sk) static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rtt; + info->tcpi_rcv_rtt = tfrc_rx_hist_rtt(&ccid3_hc_rx_sk(sk)->hist); } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, @@ -759,7 +744,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, if (len < sizeof(rx_info)) return -EINVAL; rx_info.tfrcrx_x_recv = hcrx->x_recv; - rx_info.tfrcrx_rtt = hcrx->rtt; + rx_info.tfrcrx_rtt = tfrc_rx_hist_rtt(&hcrx->hist); rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hcrx->p_inverse); len = sizeof(rx_info); val = &rx_info; diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 72e110a1100f..342235c57bf3 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -127,7 +127,6 @@ enum ccid3_fback_type { * @last_counter - Tracks window counter (RFC 4342, 8.1) * @feedback - The type of the feedback last sent * @x_recv - Receiver estimate of send rate (RFC 3448, sec. 4.3) - * @rtt - Receiver estimate of RTT * @tstamp_last_feedback - Time at which last feedback was sent * @hist - Packet history (loss detection + RTT sampling) * @li_hist - Loss Interval database @@ -137,7 +136,6 @@ struct ccid3_hc_rx_sock { u8 last_counter:4; enum ccid3_fback_type feedback:4; u32 x_recv; - u32 rtt; ktime_t tstamp_last_feedback; struct tfrc_rx_hist hist; struct tfrc_loss_hist li_hist; diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index ee34b4564242..e2e250aa5c89 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -151,14 +151,31 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); + +static void __tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) +{ + struct tfrc_rx_hist_entry *tmp = h->ring[a]; + + h->ring[a] = h->ring[b]; + h->ring[b] = tmp; +} + static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) { - const u8 idx_a = tfrc_rx_hist_index(h, a), - idx_b = tfrc_rx_hist_index(h, b); - struct tfrc_rx_hist_entry *tmp = h->ring[idx_a]; + __tfrc_rx_hist_swap(h, tfrc_rx_hist_index(h, a), + tfrc_rx_hist_index(h, b)); +} - h->ring[idx_a] = h->ring[idx_b]; - h->ring[idx_b] = tmp; +/** + * tfrc_rx_hist_resume_rtt_sampling - Prepare RX history for RTT sampling + * This is called after loss detection has finished, when the history entry + * with the index of `loss_count' holds the highest-received sequence number. + * RTT sampling requires this information at ring[0] (tfrc_rx_hist_sample_rtt). + */ +static inline void tfrc_rx_hist_resume_rtt_sampling(struct tfrc_rx_hist *h) +{ + __tfrc_rx_hist_swap(h, 0, tfrc_rx_hist_index(h, h->loss_count)); + h->loss_count = h->loss_start = 0; } /* @@ -200,8 +217,7 @@ static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2 if (dccp_loss_free(s2, s1, n1)) { /* hole is filled: S0, S2, and S1 are consecutive */ - h->loss_count = 0; - h->loss_start = tfrc_rx_hist_index(h, 1); + tfrc_rx_hist_resume_rtt_sampling(h); } else /* gap between S2 and S1: just update loss_prev */ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2); @@ -254,8 +270,7 @@ static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3) if (dccp_loss_free(s1, s2, n2)) { /* entire hole filled by S0, S3, S1, S2 */ - h->loss_start = tfrc_rx_hist_index(h, 2); - h->loss_count = 0; + tfrc_rx_hist_resume_rtt_sampling(h); } else { /* gap remains between S1 and S2 */ h->loss_start = tfrc_rx_hist_index(h, 1); @@ -299,8 +314,7 @@ static void __three_after_loss(struct tfrc_rx_hist *h) if (dccp_loss_free(s2, s3, n3)) { /* no gap between S2 and S3: entire hole is filled */ - h->loss_start = tfrc_rx_hist_index(h, 3); - h->loss_count = 0; + tfrc_rx_hist_resume_rtt_sampling(h); } else { /* gap between S2 and S3 */ h->loss_start = tfrc_rx_hist_index(h, 2); @@ -340,6 +354,7 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, if (h->loss_count == 0) { __do_track_loss(h, skb, ndp); + tfrc_rx_hist_sample_rtt(h, skb); } else if (h->loss_count == 1) { __one_after_loss(h, skb, ndp); } else if (h->loss_count != 2) { @@ -435,11 +450,24 @@ static inline struct tfrc_rx_hist_entry * * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able * to compute a sample with given data - calling function should check this. */ -u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) +void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) { - u32 sample = 0, - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + u32 sample = 0, delta_v; + + /* + * When not to sample: + * - on non-data packets + * (RFC 4342, 8.1: CCVal only fully defined for data packets); + * - when no data packets have been received yet + * (FIXME: using sampled packet size as indicator here); + * - as long as there are gaps in the sequence space (pending loss). + */ + if (!dccp_data_packet(skb) || h->packet_size == 0 || + tfrc_rx_hist_loss_pending(h)) + return; + + delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, + tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ if (h->rtt_sample_prev == 2) { /* previous candidate stored */ @@ -479,6 +507,6 @@ u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) h->rtt_sample_prev = 0; /* use current entry as next reference */ keep_ref_for_next_time: - return sample; + h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 9); } EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index b7c87a1a2720..ba5832bbc348 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -91,6 +91,7 @@ struct tfrc_rx_hist_entry { * @loss_count: Number of entries in circular history * @loss_start: Movable index (for loss detection) * @rtt_sample_prev: Used during RTT sampling, points to candidate entry + * @rtt_estimate: Receiver RTT estimate * @packet_size: Packet size in bytes (as per RFC 3448, 3.1) * @bytes_recvd: Number of bytes received since last sending feedback */ @@ -98,7 +99,10 @@ struct tfrc_rx_hist { struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; u8 loss_count:2, loss_start:2; + /* Receiver RTT sampling */ #define rtt_sample_prev loss_start + u32 rtt_estimate; + /* Receiver sampling of application payload lengths */ u32 packet_size, bytes_recvd; }; @@ -154,6 +158,15 @@ static inline u32 tfrc_rx_hist_packet_size(const struct tfrc_rx_hist *h) return TCP_MIN_RCVMSS; } return h->packet_size; + +} +static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h) +{ + if (h->rtt_estimate == 0) { + DCCP_WARN("No RTT estimate available, using fallback RTT\n"); + return DCCP_FALLBACK_RTT; + } + return h->rtt_estimate; } extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, @@ -167,8 +180,8 @@ extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, const u64 ndp, u32 (*first_li)(struct sock *sk), struct sock *sk); -extern u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, - const struct sk_buff *skb); +extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, + const struct sk_buff *skb); extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); extern void tfrc_rx_hist_purge(struct tfrc_rx_hist *h); From 49ffc29a0223adbe0ea7005eea3ab2a03abbeb06 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 097/105] dccp: Clamping RTT values This extracts the clamping part of dccp_sample_rtt() and makes it available to other parts of the code (as e.g. used in the next patch). Note: The function dccp_sample_rtt() now reduces to subtracting the elapsed time. This could be eliminated but would require shorter prefixes and thus is not done by this patch - maybe an idea for later. Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 9 ++++++++- net/dccp/input.c | 11 +---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index b63a82ccb2b2..5281190aa19c 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -334,7 +334,14 @@ extern struct sk_buff *dccp_ctl_make_reset(struct sock *sk, extern int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code); extern void dccp_send_close(struct sock *sk, const int active); extern int dccp_invalid_packet(struct sk_buff *skb); -extern u32 dccp_sample_rtt(struct sock *sk, long delta); + +static inline u32 dccp_sane_rtt(long usec_sample) +{ + if (unlikely(usec_sample <= 0 || usec_sample > DCCP_SANE_RTT_MAX)) + DCCP_WARN("RTT sample %ld out of bounds!\n", usec_sample); + return clamp_val(usec_sample, DCCP_SANE_RTT_MIN, DCCP_SANE_RTT_MAX); +} +extern u32 dccp_sample_rtt(struct sock *sk, long delta); static inline int dccp_bad_service_code(const struct sock *sk, const __be32 service) diff --git a/net/dccp/input.c b/net/dccp/input.c index b1e38bf94456..df0e6714aa11 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -707,16 +707,7 @@ u32 dccp_sample_rtt(struct sock *sk, long delta) /* dccpor_elapsed_time is either zeroed out or set and > 0 */ delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10; - if (unlikely(delta <= 0)) { - DCCP_WARN("unusable RTT sample %ld, using min\n", delta); - return DCCP_SANE_RTT_MIN; - } - if (unlikely(delta > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %ld too large, using max\n", delta); - return DCCP_SANE_RTT_MAX; - } - - return delta; + return dccp_sane_rtt(delta); } EXPORT_SYMBOL_GPL(dccp_sample_rtt); From 22338f09bd60434a3f1d6608f0fa55972067985f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 098/105] dccp tfrc: Increase number of RTT samples This improves the receiver RTT sampling algorithm so that it tries harder to get as many RTT samples as possible. The algorithm is based the concepts presented in RFC 4340, 8.1, using timestamps and the CCVal window counter. There exist 4 cases for the CCVal difference: * == 0: less than RTT/4 passed since last packet -- unusable; * > 4: (much) more than 1 RTT has passed since last packet -- also unusable; * == 4: perfect sample (exactly one RTT has passed since last packet); * 1..3: sub-optimal sample (between RTT/4 and 3*RTT/4 has passed). In the last case the algorithm tried to optimise by storing away the candidate and then re-trying next time. The problem is that * a large number of samples is needed to smooth out the inaccuracies of the algorithm; * the sender may not be sending enough packets to warrant a "next time"; * hence it is better to use suboptimal samples whenever possible. The algorithm now stores away the current sample only if the difference is 0. Applicability and background ---------------------------- A realistic example is MP3 streaming where packets are sent at a rate of less than one packet per RTT, which means that suitable samples are absent for a very long time. The effectiveness of using suboptimal samples (with a delta between 1 and 4) was confirmed by instrumenting the algorithm with counters. The results of two 20 second test runs were: * With the old algorithm and a total of 38442 function calls, only 394 of these calls resulted in usable RTT samples (about 1%), and 378 out of these were "perfect" samples and 28013 (unused) samples had a delta of 1..3. * With the new algorithm and a total of 37057 function calls, 1702 usable RTT samples were retrieved (about 4.6%), 5 out of these were "perfect" samples. Signed-off-by: Gerrit Renker --- net/dccp/ccids/lib/packet_history.c | 83 +++++++++-------------------- 1 file changed, 24 insertions(+), 59 deletions(-) diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index e2e250aa5c89..5c4ded1cf422 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -427,32 +427,17 @@ int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk) } EXPORT_SYMBOL_GPL(tfrc_rx_hist_init); -/** - * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h) -{ - return h->ring[0]; -} - -/** - * tfrc_rx_hist_rtt_prev_s: previously suitable (wrt rtt_last_s) RTT-sampling entry - */ -static inline struct tfrc_rx_hist_entry * - tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h) -{ - return h->ring[h->rtt_sample_prev]; -} - /** * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal - * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able - * to compute a sample with given data - calling function should check this. + * Based on ideas presented in RFC 4342, 8.1. This function expects that no loss + * is pending and uses the following history entries (via rtt_sample_prev): + * - h->ring[0] contains the most recent history entry prior to @skb; + * - h->ring[1] is an unused `dummy' entry when the current difference is 0; */ void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) { - u32 sample = 0, delta_v; + struct tfrc_rx_hist_entry *last = h->ring[0]; + u32 sample, delta_v; /* * When not to sample: @@ -466,47 +451,27 @@ void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb) tfrc_rx_hist_loss_pending(h)) return; - delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); + h->rtt_sample_prev = 0; /* reset previous candidate */ - if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */ - if (h->rtt_sample_prev == 2) { /* previous candidate stored */ - sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - if (sample) - sample = 4 / sample * - ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp); - else /* - * FIXME: This condition is in principle not - * possible but occurs when CCID is used for - * two-way data traffic. I have tried to trace - * it, but the cause does not seem to be here. - */ - DCCP_BUG("please report to dccp@vger.kernel.org" - " => prev = %u, last = %u", - tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval, - tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval); - } else if (delta_v < 1) { - h->rtt_sample_prev = 1; - goto keep_ref_for_next_time; - } - - } else if (delta_v == 4) /* optimal match */ - sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp)); - else { /* suboptimal match */ - h->rtt_sample_prev = 2; - goto keep_ref_for_next_time; + delta_v = SUB16(dccp_hdr(skb)->dccph_ccval, last->tfrchrx_ccval); + if (delta_v == 0) { /* less than RTT/4 difference */ + h->rtt_sample_prev = 1; + return; } + sample = dccp_sane_rtt(ktime_to_us(net_timedelta(last->tfrchrx_tstamp))); - if (unlikely(sample > DCCP_SANE_RTT_MAX)) { - DCCP_WARN("RTT sample %u too large, using max\n", sample); - sample = DCCP_SANE_RTT_MAX; - } + if (delta_v <= 4) /* between RTT/4 and RTT */ + sample *= 4 / delta_v; + else if (!(sample < h->rtt_estimate && sample > h->rtt_estimate/2)) + /* + * Optimisation: CCVal difference is greater than 1 RTT, yet the + * sample is less than the local RTT estimate; which means that + * the RTT estimate is too high. + * To avoid noise, it is not done if the sample is below RTT/2. + */ + return; - h->rtt_sample_prev = 0; /* use current entry as next reference */ -keep_ref_for_next_time: - - h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 9); + /* Use a lower weight than usual to increase responsiveness */ + h->rtt_estimate = tfrc_ewma(h->rtt_estimate, sample, 5); } EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); From 68c89ee53571a441799c03d5e240c6441bced620 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 099/105] dccp ccid-3: Update the computation of X_recv This updates the computation of X_recv with regard to Errata 610/611 for RFC 4342 and draft rfc3448bis-06, ensuring that at least an interval of 1 RTT is used to compute X_recv. The change is wrapped into a new function ccid3_hc_rx_x_recv(). Further changes: ---------------- * feedback is not sent when no data packets arrived (bytes_recv == 0), as per rfc3448bis-06, 6.2; * take the timestamp for the feedback /after/ dccp_send_ack() returns, to avoid taking the transmission time into account (in case layer-2 is busy); * clearer handling of failure in ccid3_first_li(). Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 64 +++++++++++------------------ net/dccp/ccids/lib/packet_history.c | 30 ++++++++++++++ net/dccp/ccids/lib/packet_history.h | 13 +++++- 3 files changed, 66 insertions(+), 41 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 8e64d9665a21..f2f9514dbad2 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -533,9 +533,6 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, enum ccid3_fback_type fbtype) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - struct dccp_sock *dp = dccp_sk(sk); - ktime_t now = ktime_get_real(); - s64 delta = 0; switch (fbtype) { case CCID3_FBACK_INITIAL: @@ -565,42 +562,33 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, /* * When parameters change (new loss or p > p_prev), we do not * have a reliable estimate for R_m of [RFC 3448, 6.2] and so - * need to reuse the previous value of X_recv. However, when - * X_recv was 0 (due to early loss), this would kill X down to - * s/t_mbi (i.e. one packet in 64 seconds). - * To avoid such drastic reduction, we approximate X_recv as - * the number of bytes since last feedback. - * This is a safe fallback, since X is bounded above by X_calc. + * always check whether at least RTT time units were covered. */ - if (hcrx->x_recv > 0) - break; - /* fall through */ + hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); + break; case CCID3_FBACK_PERIODIC: /* - * FIXME: check if delta is less than or equal to 1 RTT using - * the receiver RTT sample. This is described in Errata 610/611 - * of RFC 4342 which reference section 6.2 of RFC 3448. + * Step (2) of rfc3448bis-06, 6.2: + * - if no data packets have been received, just restart timer + * - if data packets have been received, re-compute X_recv */ - delta = ktime_us_delta(now, hcrx->tstamp_last_feedback); - if (delta <= 0) - DCCP_BUG("delta (%ld) <= 0", (long)delta); - else - hcrx->x_recv = scaled_div32(hcrx->hist.bytes_recvd, delta); + if (hcrx->hist.bytes_recvd == 0) + goto prepare_for_next_time; + hcrx->x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); break; default: return; } - ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", - (long)delta, hcrx->x_recv, hcrx->p_inverse); + ccid3_pr_debug("X_recv=%u, 1/p=%u\n", hcrx->x_recv, hcrx->p_inverse); - hcrx->tstamp_last_feedback = now; - hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; - hcrx->hist.bytes_recvd = 0; - hcrx->feedback = fbtype; - - dp->dccps_hc_rx_insert_options = 1; + dccp_sk(sk)->dccps_hc_rx_insert_options = 1; dccp_send_ack(sk); + +prepare_for_next_time: + tfrc_rx_hist_restart_byte_counter(&hcrx->hist); + hcrx->last_counter = dccp_hdr(skb)->dccph_ccval; + hcrx->feedback = fbtype; } static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) @@ -639,7 +627,7 @@ static u32 ccid3_first_li(struct sock *sk) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); u32 s = tfrc_rx_hist_packet_size(&hcrx->hist), - rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p, delta; + rtt = tfrc_rx_hist_rtt(&hcrx->hist), x_recv, p; u64 fval; /* @@ -650,16 +638,9 @@ static u32 ccid3_first_li(struct sock *sk) if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) return 5; - delta = ktime_to_us(net_timedelta(hcrx->tstamp_last_feedback)); - x_recv = scaled_div32(hcrx->hist.bytes_recvd, delta); - if (x_recv == 0) { /* would also trigger divide-by-zero */ - DCCP_WARN("X_recv==0\n"); - if (hcrx->x_recv == 0) { - DCCP_BUG("stored value of X_recv is zero"); - return ~0U; - } - x_recv = hcrx->x_recv; - } + x_recv = tfrc_rx_hist_x_recv(&hcrx->hist, hcrx->x_recv); + if (x_recv == 0) + goto failed; fval = scaled_div32(scaled_div(s, rtt), x_recv); p = tfrc_calc_x_reverse_lookup(fval); @@ -667,7 +648,10 @@ static u32 ccid3_first_li(struct sock *sk) ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied " "loss rate=%u\n", dccp_role(sk), sk, x_recv, p); - return p == 0 ? ~0U : scaled_div(1, p); + if (p > 0) + return scaled_div(1, p); +failed: + return UINT_MAX; } static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 5c4ded1cf422..547ad098ea6a 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -385,6 +385,36 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, } EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); +/* Compute the sending rate X_recv measured between feedback intervals */ +u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv) +{ + u64 bytes = h->bytes_recvd, last_rtt = h->rtt_estimate; + s64 delta = ktime_to_us(net_timedelta(h->bytes_start)); + + WARN_ON(delta <= 0); + /* + * Ensure that the sampling interval for X_recv is at least one RTT, + * by extending the sampling interval backwards in time, over the last + * R_(m-1) seconds, as per rfc3448bis-06, 6.2. + * To reduce noise (e.g. when the RTT changes often), this is only + * done when delta is smaller than RTT/2. + */ + if (last_x_recv > 0 && delta < last_rtt/2) { + tfrc_pr_debug("delta < RTT ==> %ld us < %u us\n", + (long)delta, (unsigned)last_rtt); + + delta = (bytes ? delta : 0) + last_rtt; + bytes += div_u64((u64)last_x_recv * last_rtt, USEC_PER_SEC); + } + + if (unlikely(bytes == 0)) { + DCCP_WARN("X_recv == 0, using old value of %u\n", last_x_recv); + return last_x_recv; + } + return scaled_div32(bytes, delta); +} +EXPORT_SYMBOL_GPL(tfrc_rx_hist_x_recv); + void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) { int i; diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index ba5832bbc348..6552be63cb0a 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -93,7 +93,8 @@ struct tfrc_rx_hist_entry { * @rtt_sample_prev: Used during RTT sampling, points to candidate entry * @rtt_estimate: Receiver RTT estimate * @packet_size: Packet size in bytes (as per RFC 3448, 3.1) - * @bytes_recvd: Number of bytes received since last sending feedback + * @bytes_recvd: Number of bytes received since @bytes_start + * @bytes_start: Start time for counting @bytes_recvd */ struct tfrc_rx_hist { struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1]; @@ -105,6 +106,7 @@ struct tfrc_rx_hist { /* Receiver sampling of application payload lengths */ u32 packet_size, bytes_recvd; + ktime_t bytes_start; }; /** @@ -169,6 +171,15 @@ static inline u32 tfrc_rx_hist_rtt(const struct tfrc_rx_hist *h) return h->rtt_estimate; } +static inline void tfrc_rx_hist_restart_byte_counter(struct tfrc_rx_hist *h) +{ + h->bytes_recvd = 0; + h->bytes_start = ktime_get_real(); +} + +extern u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv); + + extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb, const u64 ndp); From 88e97a93342c0b9e835d510921e7b2df8547d1bd Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 100/105] dccp ccid-3: Update the RX history records in one place This patch is a requirement for enabling ECN support later on. With that change in mind, the following preparations are done: * renamed handle_loss() into congestion_event() since it returns true when a congestion event happens (it will eventually also take care of ECN packets); * lets tfrc_rx_congestion_event() always update the RX history records, since this routine needs to be called for each non-duplicate packet anyway; * made all involved boolean-type functions to have return type `bool'; Updating the RX history records is now only necessary for the packets received up to sending the first feedback. The receiver code becomes again simpler. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 37 +++++++++-------------------- net/dccp/ccids/lib/loss_interval.c | 10 ++++---- net/dccp/ccids/lib/loss_interval.h | 2 +- net/dccp/ccids/lib/packet_history.c | 37 ++++++++++++++--------------- net/dccp/ccids/lib/packet_history.h | 10 ++++---- 5 files changed, 40 insertions(+), 56 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index f2f9514dbad2..aca072b3edae 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -657,41 +657,26 @@ failed: static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE; const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp; const bool is_data_packet = dccp_data_packet(skb); /* * Perform loss detection and handle pending losses */ - if (tfrc_rx_handle_loss(&hcrx->hist, &hcrx->li_hist, - skb, ndp, ccid3_first_li, sk)) { - do_feedback = CCID3_FBACK_PARAM_CHANGE; - goto done_receiving; - } - - if (unlikely(hcrx->feedback == CCID3_FBACK_NONE)) { - if (is_data_packet) - do_feedback = CCID3_FBACK_INITIAL; - goto update_records; - } - - if (tfrc_rx_hist_loss_pending(&hcrx->hist)) - return; /* done receiving */ - + if (tfrc_rx_congestion_event(&hcrx->hist, &hcrx->li_hist, + skb, ndp, ccid3_first_li, sk)) + ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PARAM_CHANGE); + /* + * Feedback for first non-empty data packet (RFC 3448, 6.3) + */ + else if (unlikely(hcrx->feedback == CCID3_FBACK_NONE && is_data_packet)) + ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_INITIAL); /* * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3 */ - if (is_data_packet && - SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) - do_feedback = CCID3_FBACK_PERIODIC; - -update_records: - tfrc_rx_hist_add_packet(&hcrx->hist, skb, ndp); - -done_receiving: - if (do_feedback) - ccid3_hc_rx_send_feedback(sk, skb, do_feedback); + else if (!tfrc_rx_hist_loss_pending(&hcrx->hist) && is_data_packet && + SUB16(dccp_hdr(skb)->dccph_ccval, hcrx->last_counter) > 3) + ccid3_hc_rx_send_feedback(sk, skb, CCID3_FBACK_PERIODIC); } static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk) diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index fe5c2a31c770..b1ae8f8259e5 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -140,18 +140,18 @@ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, * @sk: Used by @calc_first_li in caller-specific way (subtyping) * Updates I_mean and returns 1 if a new interval has in fact been added to @lh. */ -int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, - u32 (*calc_first_li)(struct sock *), struct sock *sk) +bool tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, + u32 (*calc_first_li)(struct sock *), struct sock *sk) { struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new; if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh))) - return 0; + return false; new = tfrc_lh_demand_next(lh); if (unlikely(new == NULL)) { DCCP_CRIT("Cannot allocate/add loss record."); - return 0; + return false; } new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno; @@ -169,7 +169,7 @@ int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, tfrc_lh_calc_i_mean(lh); } - return 1; + return true; } EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h index f101ae2cf528..d08a226db43e 100644 --- a/net/dccp/ccids/lib/loss_interval.h +++ b/net/dccp/ccids/lib/loss_interval.h @@ -67,7 +67,7 @@ static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh) struct tfrc_rx_hist; -extern int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, +extern bool tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *, u32 (*first_li)(struct sock *), struct sock *); extern void tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *); extern void tfrc_lh_cleanup(struct tfrc_loss_hist *lh); diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 547ad098ea6a..cce9f03bda3e 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -192,10 +192,8 @@ static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1) u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, s1 = DCCP_SKB_CB(skb)->dccpd_seq; - if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */ + if (!dccp_loss_free(s0, s1, n1)) /* gap between S0 and S1 */ h->loss_count = 1; - tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1); - } } static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2) @@ -328,13 +326,13 @@ static void __three_after_loss(struct tfrc_rx_hist *h) } /** - * tfrc_rx_handle_loss - Loss detection and further processing - * @h: The non-empty RX history object - * @lh: Loss Intervals database to update - * @skb: Currently received packet - * @ndp: The NDP count belonging to @skb - * @calc_first_li: Caller-dependent computation of first loss interval in @lh - * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) + * tfrc_rx_congestion_event - Loss detection and further processing + * @h: The non-empty RX history object + * @lh: Loss Intervals database to update + * @skb: Currently received packet + * @ndp: The NDP count belonging to @skb + * @first_li: Caller-dependent computation of first loss interval in @lh + * @sk: Used by @calc_first_li (see tfrc_lh_interval_add) * Chooses action according to pending loss, updates LI database when a new * loss was detected, and does required post-processing. Returns 1 when caller * should send feedback, 0 otherwise. @@ -342,12 +340,12 @@ static void __three_after_loss(struct tfrc_rx_hist *h) * records accordingly, the caller should not perform any more RX history * operations when loss_count is greater than 0 after calling this function. */ -int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*calc_first_li)(struct sock *), struct sock *sk) +bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*first_li)(struct sock *), struct sock *sk) { - int is_new_loss = 0; + bool new_event = false; if (tfrc_rx_hist_duplicate(h, skb)) return 0; @@ -355,6 +353,7 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, if (h->loss_count == 0) { __do_track_loss(h, skb, ndp); tfrc_rx_hist_sample_rtt(h, skb); + tfrc_rx_hist_add_packet(h, skb, ndp); } else if (h->loss_count == 1) { __one_after_loss(h, skb, ndp); } else if (h->loss_count != 2) { @@ -363,7 +362,7 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, /* * Update Loss Interval database and recycle RX records */ - is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk); + new_event = tfrc_lh_interval_add(lh, h, first_li, sk); __three_after_loss(h); } @@ -378,12 +377,12 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, } /* RFC 3448, 6.1: update I_0, whose growth implies p <= p_prev */ - if (!is_new_loss) + if (!new_event) tfrc_lh_update_i_mean(lh, skb); - return is_new_loss; + return new_event; } -EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); +EXPORT_SYMBOL_GPL(tfrc_rx_congestion_event); /* Compute the sending rate X_recv measured between feedback intervals */ u32 tfrc_rx_hist_x_recv(struct tfrc_rx_hist *h, const u32 last_x_recv) diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 6552be63cb0a..555e65cd73a0 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -186,11 +186,11 @@ extern void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, extern int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb); struct tfrc_loss_hist; -extern int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, - struct tfrc_loss_hist *lh, - struct sk_buff *skb, const u64 ndp, - u32 (*first_li)(struct sock *sk), - struct sock *sk); +extern bool tfrc_rx_congestion_event(struct tfrc_rx_hist *h, + struct tfrc_loss_hist *lh, + struct sk_buff *skb, const u64 ndp, + u32 (*first_li)(struct sock *sk), + struct sock *sk); extern void tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb); extern int tfrc_rx_hist_init(struct tfrc_rx_hist *h, struct sock *sk); From 9d497a2c9120e31ff417e75f9f5576c4cde11281 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 101/105] dccp ccid-3: Implement rfc3448bis change to initial-rate computation The patch updates CCID-3 with regard to the latest rfc3448bis-06: * in the first revisions of the draft, MSS was used for the RFC 3390 window; * then (from revision #1 to revision #2), it used the packet size `s'; * now, in this revision (and apparently final), the value is back to MSS. This change has an implication for the case when no RTT sample is available, at the time of sending the first packet: * with RTT sample, 2*MSS/RTT <= initial_rate <= 4*MSS/RTT; * without RTT sample, the initial rate is one packet (s bytes) per second (sec. 4.2), but using s instead of MSS here creates an imbalance, since this would further reduce the initial sending rate. Hence the patch uses MSS (called MPS in RFC 4340) in all places. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index aca072b3edae..d654264c5108 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -53,18 +53,16 @@ static int ccid3_debug; /* * Compute the initial sending rate X_init in the manner of RFC 3390: * - * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT + * X_init = min(4 * MPS, max(2 * MPS, 4380 bytes)) / RTT * - * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis - * (rev-02) clarifies the use of RFC 3390 with regard to the above formula. * For consistency with other parts of the code, X_init is scaled by 2^6. */ static inline u64 rfc3390_initial_rate(struct sock *sk) { - const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - const __u32 w_init = clamp_t(__u32, 4380U, 2 * hctx->s, 4 * hctx->s); + const u32 mps = dccp_sk(sk)->dccps_mss_cache, + w_init = clamp(4380U, 2 * mps, 4 * mps); - return scaled_div(w_init << 6, hctx->rtt); + return scaled_div(w_init << 6, ccid3_hc_tx_sk(sk)->rtt); } /** @@ -293,7 +291,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * - set sending rate X_pps = 1pps as per RFC 3448, 4.2. */ hctx->rtt = DCCP_FALLBACK_RTT; - hctx->x = hctx->s; + hctx->x = dp->dccps_mss_cache; hctx->x <<= 6; } ccid3_update_send_interval(hctx); From 891e4d8a402427bc40dee4c8413213a584710372 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 102/105] dccp ccid-3: Tidy up CCID-Kconfig dependencies The per-CCID menu has several dependencies on EXPERIMENTAL. These are redundant, since net/dccp/ccids/Kconfig is sourced by net/dccp/Kconfig and since the latter menu in turn asserts a dependency on EXPERIMENTAL. The patch removes the redundant dependencies as well as the repeated reference within the sub-menu. Further changes: ---------------- Two single dependencies on CCID-3 are replaced with a single enclosing `if'. Signed-off-by: Gerrit Renker --- net/dccp/ccids/Kconfig | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 44c7e90248fa..dc973abe03b4 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -1,8 +1,7 @@ menu "DCCP CCIDs Configuration (EXPERIMENTAL)" - depends on EXPERIMENTAL config IP_DCCP_CCID2 - tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" + tristate "CCID2 (TCP-Like)" def_tristate IP_DCCP ---help--- CCID 2, TCP-like Congestion Control, denotes Additive Increase, @@ -35,7 +34,7 @@ config IP_DCCP_CCID2_DEBUG If in doubt, say N. config IP_DCCP_CCID3 - tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" + tristate "CCID3 (TCP-Friendly)" def_tristate IP_DCCP select IP_DCCP_TFRC_LIB ---help--- @@ -63,9 +62,9 @@ config IP_DCCP_CCID3 If in doubt, say M. +if IP_DCCP_CCID3 config IP_DCCP_CCID3_DEBUG bool "CCID3 debugging messages" - depends on IP_DCCP_CCID3 ---help--- Enable CCID3-specific debugging messages. @@ -78,7 +77,6 @@ config IP_DCCP_CCID3_DEBUG config IP_DCCP_CCID3_RTO int "Use higher bound for nofeedback timer" default 100 - depends on IP_DCCP_CCID3 && EXPERIMENTAL ---help--- Use higher lower bound for nofeedback timer expiration. @@ -105,6 +103,7 @@ config IP_DCCP_CCID3_RTO The purpose of the nofeedback timer is to slow DCCP down when there is serious network congestion: experimenting with larger values should therefore not be performed on WANs. +endif # IP_DCCP_CCID3 config IP_DCCP_TFRC_LIB tristate From c8f41d50adc380bfb38538ce39ca0ffea5926221 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 103/105] dccp ccid-3: Measuring the packet size s with regard to rfc3448bis-06 rfc3448bis allows three different ways of tracking the packet size `s': 1. using the MSS/MPS (at initialisation, 4.2, and in 4.1 (1)); 2. using the average of `s' (in 4.1); 3. using the maximum of `s' (in 4.2). Instead of hard-coding a single interpretation of rfc3448bis, this implements a choice of all three alternatives and suggests the first as default, since it is the option which is most consistent with other parts of the specification. The patch further deprecates the update of t_ipi whenever `s' changes. The gains of doing this are only small since a change of s takes effect at the next instant X is updated: * when the next feedback comes in (within one RTT or less); * when the nofeedback timer expires (within at most 4 RTTs). Further, there are complications caused by updating t_ipi whenever s changes: * if t_ipi had previously been updated to effect oscillation prevention (4.5), then it is impossible to make the same adjustment to t_ipi again, thus counter-acting the algorithm; * s may be updated any time and a modification of t_ipi depends on the current state (e.g. no oscillation prevention is done in the absence of feedback); * in rev-06 of rfc3448bis, there are more possible cases, depending on whether the sender is in slow-start (t_ipi <= R/W_init), or in congestion-avoidance, limited by X_recv or the throughput equation (t_ipi <= t_mbi). Thus there are side effects of always updating t_ipi as s changes. These may not be desirable. The only case I can think of where such an update makes sense is to recompute X_calc when p > 0 and when s changes (not done by this patch). Signed-off-by: Gerrit Renker --- net/dccp/ccids/Kconfig | 20 ++++++++++++++++++++ net/dccp/ccids/ccid3.c | 27 +++++++++++++++------------ 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index dc973abe03b4..fb168be2cb43 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -74,6 +74,26 @@ config IP_DCCP_CCID3_DEBUG If in doubt, say N. +choice + prompt "Select method for measuring the packet size s" + default IP_DCCP_CCID3_MEASURE_S_AS_MPS + +config IP_DCCP_CCID3_MEASURE_S_AS_MPS + bool "Always use MPS in place of s" + ---help--- + This use is recommended as it is consistent with the initialisation + of X and suggested when s varies (rfc3448bis, (1) in section 4.1). +config IP_DCCP_CCID3_MEASURE_S_AS_AVG + bool "Use moving average" + ---help--- + An alternative way of tracking s, also supported by rfc3448bis. + This used to be the default for CCID-3 in previous kernels. +config IP_DCCP_CCID3_MEASURE_S_AS_MAX + bool "Track the maximum payload length" + ---help--- + An experimental method based on tracking the maximum packet size. +endchoice + config IP_DCCP_CCID3_RTO int "Use higher bound for nofeedback timer" default 100 diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index d654264c5108..d77d3e664b7e 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -136,17 +136,18 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) } /* - * Track the mean packet size `s' (cf. RFC 4342, 5.3 and RFC 3448, 4.1) - * @len: DCCP packet payload size in bytes + * ccid3_hc_tx_measure_packet_size - Measuring the packet size `s' (sec 4.1) + * @new_len: DCCP payload size in bytes (not used by all methods) */ -static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) +static u32 ccid3_hc_tx_measure_packet_size(struct sock *sk, const u16 new_len) { - const u16 old_s = hctx->s; - - hctx->s = tfrc_ewma(hctx->s, len, 9); - - if (hctx->s != old_s) - ccid3_update_send_interval(hctx); +#if defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_AVG) + return tfrc_ewma(ccid3_hc_tx_sk(sk)->s, new_len, 9); +#elif defined(CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MAX) + return max(ccid3_hc_tx_sk(sk)->s, new_len); +#else /* CONFIG_IP_DCCP_CCID3_MEASURE_S_AS_MPS */ + return dccp_sk(sk)->dccps_mss_cache; +#endif } /* @@ -271,8 +272,6 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) /* Set t_0 for initial packet */ hctx->t_nom = now; - hctx->s = skb->len; - /* * Use initial RTT sample when available: recommended by erratum * to RFC 4342. This implements the initialisation procedure of @@ -294,6 +293,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) hctx->x = dp->dccps_mss_cache; hctx->x <<= 6; } + + /* Compute t_ipi = s / X */ + hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len); ccid3_update_send_interval(hctx); } else { @@ -326,7 +328,8 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); - ccid3_hc_tx_update_s(hctx, len); + /* Changes to s will become effective the next time X is computed */ + hctx->s = ccid3_hc_tx_measure_packet_size(sk, len); if (tfrc_tx_hist_add(&hctx->hist, dccp_sk(sk)->dccps_gss)) DCCP_CRIT("packet history - out of memory!"); From 53ac9570c8145710aaed9e1eb850c2e991a4ebc1 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 104/105] dccp ccid-3: Simplify computing and range-checking of t_ipi This patch simplifies the computation of t_ipi, avoiding expensive computations to enforce the minimum sending rate. Both RFC 3448 and rfc3448bis (revision #06), as well as RFC 4342 sec 5., require at various stages that at least one packet must be sent per t_mbi = 64 seconds. This requires frequent divisions of the type X_min = s/t_mbi, which are later converted back into an inter-packet-interval t_ipi_max = s/X_min = t_mbi. The patch removes the expensive indirection; in the unlikely case of having a sending rate less than one packet per 64 seconds, it also re-adjusts X. The following cases document conformance with RFC 3448 / rfc3448bis-06: 1) Time until receiving the first feedback packet: * if the sender has no initial RTT sample then X = s/1 Bps > s/t_mbi; * if the sender has an initial RTT sample or when the first feedback packet is received, X = W_init/R > s/t_mbi. 2) Slow-start (p == 0 and feedback packets come in): * RFC 3448 (current code) enforces a minimum of s/R > s/t_mbi; * rfc3448bis (future code) enforces an even higher minimum of W_init/R. 3) Congestion avoidance with no absence of feedback (p > 0): * when X_calc or X_recv/2 are too low, the minimum of X_min = s/t_mbi is enforced in update_x() when calling update_send_interval(); * update_send_interval() is, as before, only called when X changes (i.e. either when increasing or decreasing, not when in equilibrium). 4) Reduction of X without prior feedback or during slow-start (p==0): * both RFC 3448 and rfc3448bis here halve X directly; * the associated constraint X >= s/t_mbi is nforced here by send_interval(). 5) Reduction of X when p > 0: * X is modified indirectly via X_recv (RFC 3448) or X_recv_set (rfc3448bis); * in both cases, control goes back to section 4.3 (in both documents); * since p > 0, both documents use X = max(min(...), s/t_mbi), which is enforced in this patch by calling send_interval() from update_x(). I think that this analysis is exhaustive. Should I have forgotten a case, the worst-case consideration arises when X sinks below s/t_mbi, and is then increased back up to this minimum value. Even under this assumption, the behaviour is correct, since all lower limits of X in RFC 3448 / rfc3448bis are either equal to or greater than s/t_mbi. Note on the condition X >= s/t_mbi <==> t_ipi = s/X <= t_mbi: since X is scaled by 64, and all time units are in microseconds, the coded condition is: t_ipi = s * 64 * 10^6 usec / X <= 64 * 10^6 usec This simplifies to s / X <= 1 second <==> X * 1 second >= s > 0. (A zero `s' is not allowed by the CCID-3 code). Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index d77d3e664b7e..7cd76c6c790c 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -66,15 +66,15 @@ static inline u64 rfc3390_initial_rate(struct sock *sk) } /** - * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst - * This respects the granularity of X_inst (64 * bytes/second). + * ccid3_update_send_interval - Calculate new t_ipi = s / X + * This respects the granularity of X (64 * bytes/second) and enforces the + * scaled minimum of s * 64 / t_mbi = `s' bytes/second as per RFC 3448/4342. */ static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hctx) { + if (unlikely(hctx->x <= hctx->s)) + hctx->x = hctx->s; hctx->t_ipi = scaled_div32(((u64)hctx->s) << 6, hctx->x); - - ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hctx->t_ipi, - hctx->s, (unsigned)(hctx->x >> 6)); } static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hctx, ktime_t now) @@ -115,7 +115,6 @@ static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp) if (hctx->p > 0) { hctx->x = min(((u64)hctx->x_calc) << 6, min_rate); - hctx->x = max(hctx->x, (((u64)hctx->s) << 6) / TFRC_T_MBI); } else if (ktime_us_delta(now, hctx->t_ld) - (s64)hctx->rtt >= 0) { @@ -197,8 +196,9 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) if (hctx->t_rto == 0 || hctx->p == 0) { /* halve send rate directly */ - hctx->x = max(hctx->x / 2, (((u64)hctx->s) << 6) / TFRC_T_MBI); + hctx->x /= 2; ccid3_update_send_interval(hctx); + } else { /* * Modify the cached value of X_recv @@ -213,9 +213,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) BUG_ON(hctx->p && !hctx->x_calc); if (hctx->x_calc > (hctx->x_recv >> 5)) - hctx->x_recv = - max(hctx->x_recv / 2, - (((__u64)hctx->s) << 6) / (2 * TFRC_T_MBI)); + hctx->x_recv /= 2; else { hctx->x_recv = hctx->x_calc; hctx->x_recv <<= 4; From a3cbdde8e9c38b66b4f13ac5d6ff1939ded0ff20 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Thu, 4 Sep 2008 07:30:19 +0200 Subject: [PATCH 105/105] dccp ccid-3: Preventing Oscillations This implements [RFC 3448, 4.5], which performs congestion avoidance behaviour by reducing the transmit rate as the queueing delay (measured in terms of long-term RTT) increases. Oscillation can be turned on/off via a module option (do_osc_prev) and via sysfs (using mode 0644), the default is off. Overflow analysis: ------------------ * oscillation prevention is done after update_x(), so that t_ipi <= 64000; * hence the multiplication "t_ipi * sqrt(R_sample)" needs 64 bits; * done using u64 for sqrt_sample and explicit typecast of t_ipi; * the divisor, R_sqmean, is non-zero because oscillation prevention is first called when receiving the second feedback packet, and tfrc_scaled_rtt() > 0. A detailed discussion of the algorithm (with plots) is on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ccid3/sender_notes/oscillation_prevention/ The algorithm has negative side effects: * when allowing to decrease t_ipi (leads to a large RTT) and * when using it during slow-start; both uses are therefore disabled. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 40 +++++++++++++++++++++++++++++++++++++++ net/dccp/ccids/ccid3.h | 6 ++++-- net/dccp/ccids/lib/tfrc.h | 15 +++++++++++++++ 3 files changed, 59 insertions(+), 2 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 7cd76c6c790c..06cfdad84a6a 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -49,6 +49,8 @@ static int ccid3_debug; /* * Transmitter Half-Connection Routines */ +/* Oscillation Prevention/Reduction: recommended by rfc3448bis, on by default */ +static int do_osc_prev = true; /* * Compute the initial sending rate X_init in the manner of RFC 3390: @@ -296,6 +298,9 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) hctx->s = ccid3_hc_tx_measure_packet_size(sk, skb->len); ccid3_update_send_interval(hctx); + /* Seed value for Oscillation Prevention (sec. 4.5) */ + hctx->r_sqmean = tfrc_scaled_sqrt(hctx->rtt); + } else { delay = ktime_us_delta(hctx->t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); @@ -400,6 +405,38 @@ done_computing_x: hctx->s, hctx->p, hctx->x_calc, (unsigned)(hctx->x_recv >> 6), (unsigned)(hctx->x >> 6)); + /* + * Oscillation Reduction (RFC 3448, 4.5) - modifying t_ipi according to + * RTT changes, multiplying by X/X_inst = sqrt(R_sample)/R_sqmean. This + * can be useful if few connections share a link, avoiding that buffer + * fill levels (RTT) oscillate as a result of frequent adjustments to X. + * A useful presentation with background information is in + * Joerg Widmer, "Equation-Based Congestion Control", + * MSc Thesis, University of Mannheim, Germany, 2000 + * (sec. 3.6.4), who calls this ISM ("Inter-packet Space Modulation"). + */ + if (do_osc_prev) { + r_sample = tfrc_scaled_sqrt(r_sample); + /* + * The modulation can work in both ways: increase/decrease t_ipi + * according to long-term increases/decreases of the RTT. The + * former is a useful measure, since it works against queue + * build-up. The latter temporarily increases the sending rate, + * so that buffers fill up more quickly. This in turn causes + * the RTT to increase, so that either later reduction becomes + * necessary or the RTT stays at a very high level. Decreasing + * t_ipi is therefore not supported. + * Furthermore, during the initial slow-start phase the RTT + * naturally increases, where using the algorithm would cause + * delays. Hence it is disabled during the initial slow-start. + */ + if (r_sample > hctx->r_sqmean && hctx->p > 0) + hctx->t_ipi = div_u64((u64)hctx->t_ipi * (u64)r_sample, + hctx->r_sqmean); + hctx->t_ipi = min_t(u32, hctx->t_ipi, TFRC_T_MBI); + /* update R_sqmean _after_ computing the modulation factor */ + hctx->r_sqmean = tfrc_ewma(hctx->r_sqmean, r_sample, 9); + } /* unschedule no feedback timer */ sk_stop_timer(sk, &hctx->no_feedback_timer); @@ -749,6 +786,9 @@ static struct ccid_operations ccid3 = { .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt, }; +module_param(do_osc_prev, bool, 0644); +MODULE_PARM_DESC(do_osc_prev, "Use Oscillation Prevention (RFC 3448, 4.5)"); + #ifdef CONFIG_IP_DCCP_CCID3_DEBUG module_param(ccid3_debug, bool, 0644); MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 342235c57bf3..af6e1bf937d9 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -47,8 +47,8 @@ /* Two seconds as per RFC 3448 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) -/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ -#define TFRC_T_MBI 64 +/* Maximum backoff interval t_mbi (RFC 3448, 4.3) */ +#define TFRC_T_MBI (64 * USEC_PER_SEC) /* * The t_delta parameter (RFC 3448, 4.6): delays of less than %USEC_PER_MSEC are @@ -76,6 +76,7 @@ enum ccid3_options { * @x_recv - Receive rate in 64 * bytes per second * @x_calc - Calculated rate in bytes per second * @rtt - Estimate of current round trip time in usecs + * @r_sqmean - Estimate of long-term RTT (RFC 3448, 4.5) * @p - Current loss event rate (0-1) scaled by 1000000 * @s - Packet size in bytes * @t_rto - Nofeedback Timer setting in usecs @@ -94,6 +95,7 @@ struct ccid3_hc_tx_sock { u64 x_recv; u32 x_calc; u32 rtt; + u16 r_sqmean; u32 p; u32 t_rto; u32 t_ipi; diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index bb47146ac7d1..ede12f53de5a 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -47,6 +47,21 @@ static inline u32 scaled_div32(u64 a, u64 b) return result; } +/** + * tfrc_scaled_sqrt - Compute scaled integer sqrt(x) for 0 < x < 2^22-1 + * Uses scaling to improve accuracy of the integer approximation of sqrt(). The + * scaling factor of 2^10 limits the maximum @sample to 4e6; this is okay for + * clamped RTT samples (dccp_sample_rtt). + * Should best be used for expressions of type sqrt(x)/sqrt(y), since then the + * scaling factor is neutralised. For this purpose, it avoids returning zero. + */ +static inline u16 tfrc_scaled_sqrt(const u32 sample) +{ + const unsigned long non_zero_sample = sample ? : 1; + + return int_sqrt(non_zero_sample << 10); +} + /** * tfrc_ewma - Exponentially weighted moving average * @weight: Weight to be used as damping factor, in units of 1/10