From 8a508ac26e72aa203677aa6a8464bd3ea44216a6 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 3 Dec 2006 14:50:23 -0200 Subject: [PATCH 01/12] [DCCP]: Use higher RTO default for CCID3 The TFRC nofeedback timer normally expires after the maximum of 4 RTTs and twice the current send interval (RFC 3448, 4.3). On LANs with a small RTT this can mean a high processing load and reduced performance, since then the nofeedback timer is triggered very frequently. This patch provides a configuration option to set the bound for the nofeedback timer, using as default 100 milliseconds. By setting the configuration option to 0, strict RFC 3448 behaviour can be enforced for the nofeedback timer. Signed-off-by: Gerrit Renker Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ccids/Kconfig | 33 +++++++++++++++++++++++++++++++++ net/dccp/ccids/ccid3.c | 21 +++++++++++++-------- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index dac89166eb1..80f46988769 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -89,4 +89,37 @@ config IP_DCCP_CCID3_DEBUG parameter to 0 or 1. If in doubt, say N. + +config IP_DCCP_CCID3_RTO + int "Use higher bound for nofeedback timer" + default 100 + depends on IP_DCCP_CCID3 && EXPERIMENTAL + ---help--- + Use higher lower bound for nofeedback timer expiration. + + The TFRC nofeedback timer normally expires after the maximum of 4 + RTTs and twice the current send interval (RFC 3448, 4.3). On LANs + with a small RTT this can mean a high processing load and reduced + performance, since then the nofeedback timer is triggered very + frequently. + + This option enables to set a higher lower bound for the nofeedback + value. Values in units of milliseconds can be set here. + + A value of 0 disables this feature by enforcing the value specified + in RFC 3448. The following values have been suggested as bounds for + experimental use: + * 16-20ms to match the typical multimedia inter-frame interval + * 100ms as a reasonable compromise [default] + * 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4) + + The default of 100ms is a compromise between a large value for + efficient DCCP implementations, and a small value to avoid disrupting + the network in times of congestion. + + The purpose of the nofeedback timer is to slow DCCP down when there + is serious network congestion: experimenting with larger values should + therefore not be performed on WANs. + + endmenu diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 70ebe705eb7..99807783a22 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -245,9 +245,10 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) } /* * Schedule no feedback timer to expire in - * max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi) + * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) + * See comments in packet_recv() regarding the value of t_RTO. */ - t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi); + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); break; case TFRC_SSTATE_NO_SENT: DCCP_BUG("Illegal %s state NO_SENT, sk=%p", dccp_role(sk), sk); @@ -512,16 +513,20 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) */ sk->sk_write_space(sk); - /* Update timeout interval. We use the alternative variant of - * [RFC 3448, 3.1] which sets the upper bound of t_rto to one - * second, as it is suggested for TCP (see RFC 2988, 2.4). */ + /* + * Update timeout interval for the nofeedback timer. + * We use a configuration option to increase the lower bound. + * This can help avoid triggering the nofeedback timer too often + * ('spinning') on LANs with small RTTs. + */ hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt, - USEC_PER_SEC ); + CONFIG_IP_DCCP_CCID3_RTO * + (USEC_PER_SEC/1000) ); /* * Schedule no feedback timer to expire in - * max(4 * R, 2 * s/X) = max(4 * R, 2 * t_ipi) + * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) */ - t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi); + t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi); ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to " "expire in %lu jiffies (%luus)\n", From 76d127779e51fbf67ad6793214369aa1fea90278 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 3 Dec 2006 14:50:42 -0200 Subject: [PATCH 02/12] [DCCP]: Fix BUG in retransmission delay calculation This bug resulted in ccid3_hc_tx_send_packet returning negative delay values, which in turn triggered silently dequeueing packets in dccp_write_xmit. As a result, only a few out of the submitted packets made it at all onto the network. Occasionally, when dccp_wait_for_ccid was involved, this also triggered a bug warning since ccid3_hc_tx_send_packet returned a negative value (which in reality was a negative delay value). The cause for this bug lies in the comparison if (delay >= hctx->ccid3hctx_delta) return delay / 1000L; The type of `delay' is `long', that of ccid3hctx_delta is `u32'. When comparing negative long values against u32 values, the test returned `true' whenever delay was smaller than 0 (meaning the packet was overdue to send). The fix is by casting, subtracting, and then testing the difference with regard to 0. This has been tested and shown to work. Signed-off-by: Gerrit Renker Signed-off-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ccids/ccid3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 99807783a22..22a07248c24 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -339,7 +339,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) * else * // send the packet in (t_nom - t_now) milliseconds. */ - if (delay >= hctx->ccid3hctx_delta) + if (delay - (long)hctx->ccid3hctx_delta >= 0) return delay / 1000L; break; case TFRC_SSTATE_TERM: From 5c3fbb6acf9d32772ec7fc01cedd9478d0e26f44 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 3 Dec 2006 14:50:56 -0200 Subject: [PATCH 03/12] [DCCP] ccid3: Fix bug in calculation of send rate The main object of this patch is the following bug: ==> In ccid3_hc_tx_packet_recv, the parameters p and X_recv were updated _after_ the send rate was calculated. This is clearly an error and is resolved by re-ordering statements. In addition, * r_sample is converted from u32 to long to check whether the time difference was negative (it would otherwise be converted to a large u32 value) * protection against RTT=0 (this is possible) is provided in a further patch * t_elapsed is also converted to long, to match the type of r_sample * adds a a more debugging information regarding current send rates * various trivial comment/documentation updates Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ccids/ccid3.c | 89 ++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 37 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 22a07248c24..bd353044c54 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -121,12 +121,15 @@ static inline void ccid3_update_send_time(struct ccid3_hc_tx_sock *hctx) /* * Update X by * If (p > 0) - * x_calc = calcX(s, R, p); + * X_calc = calcX(s, R, p); * X = max(min(X_calc, 2 * X_recv), s / t_mbi); * Else * If (now - tld >= R) * X = max(min(2 * X, 2 * X_recv), s / R); * tld = now; + * + * If X has changed, we also update the scheduled send time t_now, + * the inter-packet interval t_ipi, and the delta value. */ static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) @@ -413,10 +416,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) struct dccp_tx_hist_entry *packet; struct timeval now; unsigned long t_nfb; - u32 t_elapsed; u32 pinv; - u32 x_recv; - u32 r_sample; + long r_sample, t_elapsed; BUG_ON(hctx == NULL); @@ -427,31 +428,51 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) opt_recv = &hctx->ccid3hctx_options_received; - t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10; - x_recv = opt_recv->ccid3or_receive_rate; - pinv = opt_recv->ccid3or_loss_event_rate; - switch (hctx->ccid3hctx_state) { case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: - /* Calculate new round trip sample by - * R_sample = (now - t_recvdata) - t_delay */ - /* get t_recvdata from history */ + /* get packet from history to look up t_recvdata */ packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist, DCCP_SKB_CB(skb)->dccpd_ack_seq); if (unlikely(packet == NULL)) { - DCCP_WARN("%s, sk=%p, seqno %llu(%s) does't exist " + DCCP_WARN("%s(%p), seqno %llu(%s) doesn't exist " "in history!\n", dccp_role(sk), sk, (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq, dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type)); return; } - /* Update RTT */ + /* Update receive rate */ + hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate; + + /* Update loss event rate */ + pinv = opt_recv->ccid3or_loss_event_rate; + if (pinv == ~0U || pinv == 0) + hctx->ccid3hctx_p = 0; + else { + hctx->ccid3hctx_p = 1000000 / pinv; + + if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { + hctx->ccid3hctx_p = TFRC_SMALLEST_P; + ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", + dccp_role(sk), sk); + } + } + dccp_timestamp(sk, &now); - r_sample = timeval_delta(&now, &packet->dccphtx_tstamp); - if (unlikely(r_sample <= t_elapsed)) - DCCP_WARN("r_sample=%uus,t_elapsed=%uus\n", + + /* + * Calculate new round trip sample as per [RFC 3448, 4.3] by + * R_sample = (now - t_recvdata) - t_elapsed + */ + r_sample = timeval_delta(&now, &packet->dccphtx_tstamp); + t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10; + + if (unlikely(r_sample <= 0)) { + DCCP_WARN("WARNING: R_sample (%ld) <= 0!\n", r_sample); + r_sample = 0; + } else if (unlikely(r_sample <= t_elapsed)) + DCCP_WARN("WARNING: r_sample=%ldus <= t_elapsed=%ldus\n", r_sample, t_elapsed); else r_sample -= t_elapsed; @@ -474,31 +495,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx->ccid3hctx_t_ld = now; ccid3_update_send_time(hctx); + + ccid3_pr_debug("%s(%p), s=%u, w_init=%u, " + "R_sample=%ldus, X=%u\n", dccp_role(sk), + sk, hctx->ccid3hctx_s, w_init, r_sample, + hctx->ccid3hctx_x); + ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK); } else { - hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 + - r_sample / 10; + hctx->ccid3hctx_rtt = (9 * hctx->ccid3hctx_rtt + + (u32)r_sample ) / 10; + ccid3_hc_tx_update_x(sk, &now); - } - ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, " - "r_sample=%us\n", dccp_role(sk), sk, - hctx->ccid3hctx_rtt, r_sample); - - /* Update receive rate */ - hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */ - - /* Update loss event rate */ - if (pinv == ~0 || pinv == 0) - hctx->ccid3hctx_p = 0; - else { - hctx->ccid3hctx_p = 1000000 / pinv; - - if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { - hctx->ccid3hctx_p = TFRC_SMALLEST_P; - ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", - dccp_role(sk), sk); - } + ccid3_pr_debug("%s(%p), RTT=%uus (sample=%ldus), s=%u, " + "p=%u, X_calc=%u, X=%u\n", dccp_role(sk), + sk, hctx->ccid3hctx_rtt, r_sample, + hctx->ccid3hctx_s, hctx->ccid3hctx_p, + hctx->ccid3hctx_x_calc, + hctx->ccid3hctx_x); } /* unschedule no feedback timer */ From 26af3072b035daadf34a99d02510f0ff98f41f90 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 3 Dec 2006 14:51:14 -0200 Subject: [PATCH 04/12] [DCCP] ccid3: Fix warning message about illegal ACK This avoids a (harmless) warning message being printed at the DCCP server (the receiver of a DCCP half connection). Incoming packets are both directed to * ccid_hc_rx_packet_recv() for the server half * ccid_hc_tx_packet_recv() for the client half The message gets printed since on a server the client half is currently not sending data packets. This is resolved for the moment by checking the DCCP-role first. In future times (bidirectional DCCP connections), this test may have to be more sophisticated. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ccids/ccid3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index bd353044c54..721efc7ed31 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -555,7 +555,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hctx->ccid3hctx_idle = 1; break; case TFRC_SSTATE_NO_SENT: - DCCP_WARN("Illegal ACK received - no packet has been sent\n"); + if (dccp_sk(sk)->dccps_role == DCCP_ROLE_CLIENT) + DCCP_WARN("Illegal ACK received - no packet sent\n"); /* fall through */ case TFRC_SSTATE_TERM: /* ignore feedback when closing */ break; From 50ab46c790a3408c441ba4c2faa9555cacc20028 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 3 Dec 2006 14:51:29 -0200 Subject: [PATCH 05/12] [DCCP] tfrc: Document boundaries and limits of the TFRC lookup table This adds documentation for the TCP Reno throughput equation which is at the heart of the TFRC sending rate / loss rate calculations. It spells out precisely how the values were determined and what they mean. The equations were derived through reverse engineering and found to be fully accurate (verified using test programs). This patch does not change any code. Signed-off-by: Gerrit Renker Signed-off-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ccids/lib/tfrc_equation.c | 142 ++++++++++++++++++----------- 1 file changed, 87 insertions(+), 55 deletions(-) diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 2601012383f..57665e97930 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -18,10 +18,76 @@ #include "tfrc.h" #define TFRC_CALC_X_ARRSIZE 500 +#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ -#define TFRC_CALC_X_SPLIT 50000 -/* equivalent to 0.05 */ +/* + TFRC TCP Reno Throughput Equation Lookup Table for f(p) + The following two-column lookup table implements a part of the TCP throughput + equation from [RFC 3448, sec. 3.1]: + + s + X_calc = -------------------------------------------------------------- + R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3)) + + Where: + X is the transmit rate in bytes/second + s is the packet size in bytes + R is the round trip time in seconds + p is the loss event rate, between 0 and 1.0, of the number of loss + events as a fraction of the number of packets transmitted + t_RTO is the TCP retransmission timeout value in seconds + b is the number of packets acknowledged by a single TCP ACK + + We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes: + + s + X_calc = ------------------------------------------------------- + R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3)) + + which we can break down into: + + s + X_calc = --------- + R * f(p) + + where f(p) is given for 0 < p <= 1 by: + + f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3) + + Since this is kernel code, floating-point arithmetic is avoided in favour of + integer arithmetic. This means that nearly all fractional parameters are + scaled by 1000000: + * the parameters p and R + * the return result f(p) + The lookup table therefore actually tabulates the following function g(q): + + g(q) = 1000000 * f(q/1000000) + + Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer + granularity for the practically more relevant case of small values of p (up to + 5%), the second column is used; the first one ranges up to 100%. This split + corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also + determines the smallest resolution. + + The entire table is generated by: + for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) { + lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE); + lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE); + } + + With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1, + lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%) + lookup[M][0] = g(1000000) = 1000000 * f(100%) + lookup[0][1] = g(TFRC_CALC_X_SPLIT/(M+1)) = 1000000 * f(0.01%) + lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%) + + In summary, the two columns represent f(p) for the following ranges: + * The first column is for 0.002 <= p <= 1.0 + * The second column is for 0.0001 <= p <= 0.05 + Where the columns overlap, the second (finer-grained) is given preference, + i.e. the first column is used only for p >= 0.05. + */ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { { 37172, 8172 }, { 53499, 11567 }, @@ -525,62 +591,26 @@ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { { 243315981, 271305 } }; -/* Calculate the send rate as per section 3.1 of RFC3448 - -Returns send rate in bytes per second - -Integer maths and lookups are used as not allowed floating point in kernel - -The function for Xcalc as per section 3.1 of RFC3448 is: - -X = s - ------------------------------------------------------------- - R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2))) - -where -X is the trasmit rate in bytes/second -s is the packet size in bytes -R is the round trip time in seconds -p is the loss event rate, between 0 and 1.0, of the number of loss events - as a fraction of the number of packets transmitted -t_RTO is the TCP retransmission timeout value in seconds -b is the number of packets acknowledged by a single TCP acknowledgement - -we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes: - -X = s - ----------------------------------------------------------------------- - R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2))) - - -which we can break down into: - -X = s - -------- - R * f(p) - -where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p)) - -Function parameters: -s - bytes -R - RTT in usecs -p - loss rate (decimal fraction multiplied by 1,000,000) - -Returns Xcalc in bytes per second - -DON'T alter this code unless you run test cases against it as the code -has been manipulated to stop underflow/overlow. - -*/ +/** + * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 + * + * @s: packet size in bytes + * @R: RTT scaled by 1000000 (i.e., microseconds) + * @p: loss ratio estimate scaled by 1000000 + * Returns X_calc in bytes per second (not scaled). + * + * Note: DO NOT alter this code unless you run test cases against it, + * as the code has been optimized to stop underflow/overflow. + */ u32 tfrc_calc_x(u16 s, u32 R, u32 p) { int index; u32 f; u64 tmp1, tmp2; - if (p < TFRC_CALC_X_SPLIT) + if (p < TFRC_CALC_X_SPLIT) /* 0 <= p < 0.05 */ index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1; - else + else /* 0.05 <= p <= 1.00 */ index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1; if (index < 0) @@ -599,11 +629,13 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) else f = tfrc_calc_x_lookup[index][1]; + /* The following computes X = s/(R*f(p)) in bytes per second. Since f(p) + * and R are both scaled by 1000000, we need to multiply by 1000000^2. + * ==> DO NOT alter this unless you test against overflow on 32 bit */ tmp1 = ((u64)s * 100000000); tmp2 = ((u64)R * (u64)f); do_div(tmp2, 10000); do_div(tmp1, tmp2); - /* Don't alter above math unless you test due to overflow on 32 bit */ return (u32)tmp1; } @@ -611,10 +643,10 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) EXPORT_SYMBOL_GPL(tfrc_calc_x); /* - * args: fvalue - function value to match - * returns: p closest to that value + * tfrc_calc_x_reverse_lookup - try to find p given f(p) * - * both fvalue and p are multiplied by 1,000,000 to use ints + * @fvalue: function value to match, scaled by 1000000 + * Returns closest match for p, also scaled by 1000000 */ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) { From 90fb0e60dd9178dbca2e42c682c483cdb7ea9f2d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 3 Dec 2006 14:52:01 -0200 Subject: [PATCH 06/12] [DCCP] tfrc: Fix small error in reverse lookup of p for given f(p) This fixes the following small error in tfrc_calc_x_reverse_lookup. 1) The table is generated by the following equations: lookup[index][0] = g((index+1) * 1000000/TFRC_CALC_X_ARRSIZE); lookup[index][1] = g((index+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE); where g(q) is 1E6 * f(q/1E6) 2) The reverse lookup assigns an entry in lookup[index][small] 3) This index needs to match the above, i.e. * if small=0 then p = (index + 1) * 1000000/TFRC_CALC_X_ARRSIZE * if small=1 then p = (index+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE These are exactly the changes that the patch makes; previously the code did not conform to the way the lookup table was generated (this difference resulted in a mean error of about 1.12%). Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ccids/lib/tfrc_equation.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 57665e97930..78bdf348916 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -667,9 +667,9 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) ctr++; if (small) - return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE; + return (ctr + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; else - return 1000000 * ctr / TFRC_CALC_X_ARRSIZE; + return (ctr + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); From 8d0086adac0041de66b5f41b77eec0d8d239e16c Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 3 Dec 2006 14:52:26 -0200 Subject: [PATCH 07/12] [DCCP] tfrc: Add protection against invalid parameters to TFRC routines 1) For the forward X_calc lookup, it * protects effectively against RTT=0 (this case is possible), by returning the maximal lookup value instead of just setting it to 1 * reformulates the array-bounds exceeded condition: this only happens if p is greater than 1E6 (due to the scaling) * the case of negative indices can now with certainty be excluded, since documentation shows that the formulas are within bounds * additional protection against p = 0 (would give divide-by-zero) 2) For the reverse lookup, it warns against * protects against exceeding array bounds * now returns 0 if f(p) = 0, due to function definition * warns about minimal resolution error and returns the smallest table value instead of p=0 [this would mask congestion conditions] Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ccids/lib/tfrc_equation.c | 33 +++++++++++++++++------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 78bdf348916..ef3233d45a6 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -608,22 +608,19 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) u32 f; u64 tmp1, tmp2; + /* check against invalid parameters and divide-by-zero */ + BUG_ON(p > 1000000); /* p must not exceed 100% */ + BUG_ON(p == 0); /* f(0) = 0, divide by zero */ + if (R == 0) { /* possible divide by zero */ + DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc."); + return ~0U; + } + if (p < TFRC_CALC_X_SPLIT) /* 0 <= p < 0.05 */ index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1; else /* 0.05 <= p <= 1.00 */ index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1; - if (index < 0) - /* p should be 0 unless there is a bug in my code */ - index = 0; - - if (R == 0) { - DCCP_WARN("RTT==0, setting to 1\n"); - R = 1; /* RTT can't be zero or else divide by zero */ - } - - BUG_ON(index >= TFRC_CALC_X_ARRSIZE); - if (p >= TFRC_CALC_X_SPLIT) f = tfrc_calc_x_lookup[index][0]; else @@ -653,13 +650,21 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) int ctr = 0; int small; - if (fvalue < tfrc_calc_x_lookup[0][1]) + if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ return 0; + /* Error cases. */ + if (fvalue < tfrc_calc_x_lookup[0][1]) { + DCCP_WARN("fvalue %d smaller than resolution\n", fvalue); + return tfrc_calc_x_lookup[0][1]; + } + if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) { + DCCP_WARN("fvalue %d exceeds bounds!\n", fvalue); + return 1000000; + } + if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) small = 1; - else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) - return 1000000; else small = 0; From 006042d7e1a0aae35c9dd9eb8ec71fa379679adb Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 3 Dec 2006 14:52:41 -0200 Subject: [PATCH 08/12] [DCCP] tfrc: Identify TFRC table limits and simplify code This * adds documentation about the lowest resolution that is possible within the bounds of the current lookup table * defines a constant TFRC_SMALLEST_P which defines this resolution * issues a warning if a given value of p is below resolution * combines two previously adjacent if-blocks of nearly identical structure into one This patch does not change the algorithm as such. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ccids/lib/tfrc_equation.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index ef3233d45a6..0a4a3d2feba 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -19,6 +19,7 @@ #define TFRC_CALC_X_ARRSIZE 500 #define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ +#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE) /* TFRC TCP Reno Throughput Equation Lookup Table for f(p) @@ -68,7 +69,9 @@ granularity for the practically more relevant case of small values of p (up to 5%), the second column is used; the first one ranges up to 100%. This split corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also - determines the smallest resolution. + determines the smallest resolution possible with this lookup table: + + TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE The entire table is generated by: for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) { @@ -79,7 +82,7 @@ With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1, lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%) lookup[M][0] = g(1000000) = 1000000 * f(100%) - lookup[0][1] = g(TFRC_CALC_X_SPLIT/(M+1)) = 1000000 * f(0.01%) + lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%) lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%) In summary, the two columns represent f(p) for the following ranges: @@ -616,15 +619,21 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) return ~0U; } - if (p < TFRC_CALC_X_SPLIT) /* 0 <= p < 0.05 */ - index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1; - else /* 0.05 <= p <= 1.00 */ - index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1; + if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ + if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ + DCCP_WARN("Value of p (%d) below resolution. " + "Substituting %d\n", p, TFRC_SMALLEST_P); + index = 0; + } else /* 0.0001 <= p <= 0.05 */ + index = p/TFRC_SMALLEST_P - 1; + + f = tfrc_calc_x_lookup[index][1]; + + } else { /* 0.05 < p <= 1.00 */ + index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1; - if (p >= TFRC_CALC_X_SPLIT) f = tfrc_calc_x_lookup[index][0]; - else - f = tfrc_calc_x_lookup[index][1]; + } /* The following computes X = s/(R*f(p)) in bytes per second. Since f(p) * and R are both scaled by 1000000, we need to multiply by 1000000^2. From 44158306d756c88272c8faf243ca68897498e219 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 3 Dec 2006 14:53:07 -0200 Subject: [PATCH 09/12] [DCCP] ccid3: Deprecate TFRC_SMALLEST_P This patch deprecates the existing use of an arbitrary value TFRC_SMALLEST_P for low-threshold values of p. This avoids masking low-resolution errors. Instead, the code now checks against real boundaries (implemented by preceding patch) and provides warnings whenever a real value falls below the threshold. If such messages are observed, it is a better solution to take this as an indication that the lookup table needs to be re-engineered. Changelog: ---------- This patch * makes handling all TFRC resolution errors local to the TFRC library * removes unnecessary test whether X_calc is 'infinity' due to p==0 -- this condition is already caught by tfrc_calc_x() * removes setting ccid3hctx_p = TFRC_SMALLEST_P in ccid3_hc_tx_packet_recv since this is now done by the TFRC library * updates BUG_ON test in ccid3_hc_tx_no_feedback_timer to take into account that p now is either 0 (and then X_calc is irrelevant), or it is > 0; since the handling of TFRC_SMALLEST_P is now taken care of in the tfrc library Justification: -------------- The TFRC code uses a lookup table which has a bounded resolution. The lowest possible value of the loss event rate `p' which can be resolved is currently 0.0001. Substituting this lower threshold for p when p is less than 0.0001 results in a huge, exponentially-growing error. The error can be computed by the following formula: (f(0.0001) - f(p))/f(p) * 100 for p < 0.0001 Currently the solution is to use an (arbitrary) value TFRC_SMALLEST_P = 40 * 1E-6 = 0.00004 and to consider all values below this value as `virtually zero'. Due to the exponentially growing resolution error, this is not a good idea, since it hides the fact that the table can not resolve practically occurring cases. Already at p == TFRC_SMALLEST_P, the error is as high as 58.19%! Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ccids/ccid3.c | 22 ++++++---------------- net/dccp/ccids/ccid3.h | 2 -- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 721efc7ed31..cf8c07b2704 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -137,8 +137,7 @@ static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now) struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk); const __u32 old_x = hctx->ccid3hctx_x; - /* To avoid large error in calcX */ - if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) { + if (hctx->ccid3hctx_p > 0) { hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, hctx->ccid3hctx_p); @@ -226,16 +225,14 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) ccid3_tx_state_name(hctx->ccid3hctx_state)); /* Halve sending rate */ - /* If (X_calc > 2 * X_recv) + /* If (p == 0 || X_calc > 2 * X_recv) * X_recv = max(X_recv / 2, s / (2 * t_mbi)); * Else * X_recv = X_calc / 4; */ - BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P && - hctx->ccid3hctx_x_calc == 0); + BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc); - /* check also if p is zero -> x_calc is infinity? */ - if (hctx->ccid3hctx_p < TFRC_SMALLEST_P || + if (hctx->ccid3hctx_p == 0 || hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv) hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2, hctx->ccid3hctx_s / (2 * TFRC_T_MBI)); @@ -449,15 +446,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) pinv = opt_recv->ccid3or_loss_event_rate; if (pinv == ~0U || pinv == 0) hctx->ccid3hctx_p = 0; - else { - hctx->ccid3hctx_p = 1000000 / pinv; - - if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) { - hctx->ccid3hctx_p = TFRC_SMALLEST_P; - ccid3_pr_debug("%s, sk=%p, Smallest p used!\n", - dccp_role(sk), sk); - } - } + else + hctx->ccid3hctx_p = 1000000 / pinv; dccp_timestamp(sk, &now); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 27cb20ae1da..07596d704ef 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -51,8 +51,6 @@ /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ #define TFRC_T_MBI 64 -#define TFRC_SMALLEST_P 40 - enum ccid3_options { TFRC_OPT_LOSS_EVENT_RATE = 192, TFRC_OPT_LOSS_INTERVALS = 193, From 2bbf29acd8f7adcf161de7e5d891b4095687a59f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 3 Dec 2006 14:53:27 -0200 Subject: [PATCH 10/12] [DCCP] tfrc: Binary search for reverse TFRC lookup This replaces the linear search algorithm for reverse lookup with binary search. It has the advantage of better scalability: O(log2(N)) instead of O(N). This means that the average number of iterations is reduced from 250 (linear search if each value appears equally likely) down to at most 9. Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ccids/lib/tfrc_equation.c | 38 +++++++++++++++++++----------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 0a4a3d2feba..ddac2c511e2 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -594,6 +594,21 @@ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { { 243315981, 271305 } }; +/* return largest index i such that fval <= lookup[i][small] */ +static inline u32 tfrc_binsearch(u32 fval, u8 small) +{ + u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1; + + while (low < high) { + try = (low + high) / 2; + if (fval <= tfrc_calc_x_lookup[try][small]) + high = try; + else + low = try + 1; + } + return high; +} + /** * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 * @@ -656,8 +671,7 @@ EXPORT_SYMBOL_GPL(tfrc_calc_x); */ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) { - int ctr = 0; - int small; + int index; if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ return 0; @@ -672,18 +686,14 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) return 1000000; } - if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) - small = 1; - else - small = 0; - - while (fvalue > tfrc_calc_x_lookup[ctr][small]) - ctr++; - - if (small) - return (ctr + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; - else - return (ctr + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; + if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) { + index = tfrc_binsearch(fvalue, 1); + return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; + } + + /* else ... it must be in the coarse-grained column */ + index = tfrc_binsearch(fvalue, 0); + return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); From b4ad86bf52469b26148c677cb59d8bc81f129cc2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 3 Dec 2006 19:19:26 -0800 Subject: [PATCH 11/12] [XFRM] xfrm_user: Better validation of user templates. Since we never checked the ->family value of templates before, many applications simply leave it at zero. Detect this and fix it up to be the pol->family value. Also, do not clobber xp->family while reading in templates, that is not necessary. Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 50 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 6f97665983d..311205ffa77 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -858,7 +858,6 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, int i; xp->xfrm_nr = nr; - xp->family = ut->family; for (i = 0; i < nr; i++, ut++) { struct xfrm_tmpl *t = &xp->xfrm_vec[i]; @@ -876,19 +875,53 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, } } +static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) +{ + int i; + + if (nr > XFRM_MAX_DEPTH) + return -EINVAL; + + for (i = 0; i < nr; i++) { + /* We never validated the ut->family value, so many + * applications simply leave it at zero. The check was + * never made and ut->family was ignored because all + * templates could be assumed to have the same family as + * the policy itself. Now that we will have ipv4-in-ipv6 + * and ipv6-in-ipv4 tunnels, this is no longer true. + */ + if (!ut[i].family) + ut[i].family = family; + + switch (ut[i].family) { + case AF_INET: + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + break; +#endif + default: + return -EINVAL; + }; + } + + return 0; +} + static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma) { struct rtattr *rt = xfrma[XFRMA_TMPL-1]; - struct xfrm_user_tmpl *utmpl; - int nr; if (!rt) { pol->xfrm_nr = 0; } else { - nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl); + struct xfrm_user_tmpl *utmpl = RTA_DATA(rt); + int nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl); + int err; - if (nr > XFRM_MAX_DEPTH) - return -EINVAL; + err = validate_tmpl(nr, utmpl, pol->family); + if (err) + return err; copy_templates(pol, RTA_DATA(rt), nr); } @@ -1530,7 +1563,8 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, void **xf } /* build an XP */ - xp = xfrm_policy_construct(&ua->policy, (struct rtattr **) xfrma, &err); if (!xp) { + xp = xfrm_policy_construct(&ua->policy, (struct rtattr **) xfrma, &err); + if (!xp) { kfree(x); return err; } @@ -1979,7 +2013,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt, return NULL; nr = ((len - sizeof(*p)) / sizeof(*ut)); - if (nr > XFRM_MAX_DEPTH) + if (validate_tmpl(nr, ut, p->sel.family)) return NULL; if (p->dir > XFRM_POLICY_OUT) From ef9467f8f0803881d6b20ad6f0f770fc39bcc2c2 Mon Sep 17 00:00:00 2001 From: Jurij Smakov Date: Sun, 3 Dec 2006 19:33:02 -0800 Subject: [PATCH 12/12] [SUNHME]: Fix for sunhme failures on x86 The following patch fixes the failure of sunhme drivers on x86 hosts due to missing pci_enable_device() and pci_set_master() calls, lost during code refactoring. It has been filed as bugzilla bug #7502 [0] and Debian bug #397460 [1]. [0] http://bugzilla.kernel.org/show_bug.cgi?id=7502 [1] http://bugs.debian.org/397460 Signed-off-by: Jurij Smakov Signed-off-by: David S. Miller --- drivers/net/sunhme.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/sunhme.c b/drivers/net/sunhme.c index ec432ea879f..df40e34c776 100644 --- a/drivers/net/sunhme.c +++ b/drivers/net/sunhme.c @@ -3012,6 +3012,11 @@ static int __devinit happy_meal_pci_probe(struct pci_dev *pdev, #endif err = -ENODEV; + + if (pci_enable_device(pdev)) + goto err_out; + pci_set_master(pdev); + if (!strcmp(prom_name, "SUNW,qfe") || !strcmp(prom_name, "qfe")) { qp = quattro_pci_find(pdev); if (qp == NULL)