From 8a508ac26e72aa203677aa6a8464bd3ea44216a6 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 3 Dec 2006 14:50:23 -0200
Subject: [PATCH 01/12] [DCCP]: Use higher RTO default for CCID3

The TFRC nofeedback timer normally expires after the maximum of 4
RTTs and twice the current send interval (RFC 3448, 4.3). On LANs
with a small RTT this can mean a high processing load and reduced
performance, since then the nofeedback timer is triggered very
frequently.

This patch provides a configuration option to set the bound for the
nofeedback timer, using as default 100 milliseconds.

By setting the configuration option to 0, strict RFC 3448 behaviour
can be enforced for the nofeedback timer.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ccids/Kconfig | 33 +++++++++++++++++++++++++++++++++
 net/dccp/ccids/ccid3.c | 21 +++++++++++++--------
 2 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index dac89166eb1..80f46988769 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -89,4 +89,37 @@ config IP_DCCP_CCID3_DEBUG
 	    parameter to 0 or 1.
 
 	    If in doubt, say N.
+
+config IP_DCCP_CCID3_RTO
+	  int "Use higher bound for nofeedback timer"
+	  default 100
+	  depends on IP_DCCP_CCID3 && EXPERIMENTAL
+	  ---help---
+	    Use higher lower bound for nofeedback timer expiration.
+
+	    The TFRC nofeedback timer normally expires after the maximum of 4
+	    RTTs and twice the current send interval (RFC 3448, 4.3). On LANs
+	    with a small RTT this can mean a high processing load and reduced
+	    performance, since then the nofeedback timer is triggered very
+	    frequently.
+
+	    This option enables to set a higher lower bound for the nofeedback
+	    value. Values in units of milliseconds can be set here.
+
+	    A value of 0 disables this feature by enforcing the value specified
+	    in RFC 3448. The following values have been suggested as bounds for
+	    experimental use:
+	    	* 16-20ms to match the typical multimedia inter-frame interval
+	    	* 100ms as a reasonable compromise [default]
+	    	* 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4)
+
+	    The default of 100ms is a compromise between a large value for
+	    efficient DCCP implementations, and a small value to avoid disrupting
+	    the network in times of congestion.
+
+	    The purpose of the nofeedback timer is to slow DCCP down when there
+	    is serious network congestion: experimenting with larger values should
+	    therefore not be performed on WANs.
+
+
 endmenu
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 70ebe705eb7..99807783a22 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -245,9 +245,10 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 		}
 		/*
 		 * Schedule no feedback timer to expire in
-		 * max(4 * R, 2 * s/X)  =  max(4 * R, 2 * t_ipi)
+		 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
+		 * See comments in packet_recv() regarding the value of t_RTO.
 		 */
-		t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi);
+		t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
 		break;
 	case TFRC_SSTATE_NO_SENT:
 		DCCP_BUG("Illegal %s state NO_SENT, sk=%p", dccp_role(sk), sk);
@@ -512,16 +513,20 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		 */
 		sk->sk_write_space(sk);
 
-		/* Update timeout interval. We use the alternative variant of
-		 * [RFC 3448, 3.1] which sets the upper bound of t_rto to one
-		 * second, as it is suggested for TCP (see RFC 2988, 2.4). */
+		/*
+		 * Update timeout interval for the nofeedback timer.
+		 * We use a configuration option to increase the lower bound.
+		 * This can help avoid triggering the nofeedback timer too often
+		 * ('spinning') on LANs with small RTTs.
+		 */
 		hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
-					      	   USEC_PER_SEC            );
+						   CONFIG_IP_DCCP_CCID3_RTO *
+						   (USEC_PER_SEC/1000)	     );
 		/*
 		 * Schedule no feedback timer to expire in
-		 * max(4 * R, 2 * s/X)  =  max(4 * R, 2 * t_ipi)
+		 * max(t_RTO, 2 * s/X)  =  max(t_RTO, 2 * t_ipi)
 		 */
-		t_nfb = max(4 * hctx->ccid3hctx_rtt, 2 * hctx->ccid3hctx_t_ipi);
+		t_nfb = max(hctx->ccid3hctx_t_rto, 2 * hctx->ccid3hctx_t_ipi);
 			
 		ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to "
 			       "expire in %lu jiffies (%luus)\n",

From 76d127779e51fbf67ad6793214369aa1fea90278 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 3 Dec 2006 14:50:42 -0200
Subject: [PATCH 02/12] [DCCP]: Fix BUG in retransmission delay calculation

This bug resulted in ccid3_hc_tx_send_packet returning negative
delay values, which in turn triggered silently dequeueing packets in
dccp_write_xmit. As a result, only a few out of the submitted packets made
it at all onto the network.  Occasionally, when dccp_wait_for_ccid was
involved, this also triggered a bug warning since ccid3_hc_tx_send_packet
returned a negative value (which in reality was a negative delay value).

The cause for this bug lies in the comparison

 if (delay >= hctx->ccid3hctx_delta)
	return delay / 1000L;

The type of `delay' is `long', that of ccid3hctx_delta is `u32'. When comparing
negative long values against u32 values, the test returned `true' whenever delay
was smaller than 0 (meaning the packet was overdue to send).

The fix is by casting, subtracting, and then testing the difference with
regard to 0.

This has been tested and shown to work.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ccids/ccid3.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 99807783a22..22a07248c24 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -339,7 +339,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
 		 * else
 		 *       // send the packet in (t_nom - t_now) milliseconds.
 		 */
-		if (delay >= hctx->ccid3hctx_delta)
+		if (delay - (long)hctx->ccid3hctx_delta >= 0)
 			return delay / 1000L;
 		break;
 	case TFRC_SSTATE_TERM:

From 5c3fbb6acf9d32772ec7fc01cedd9478d0e26f44 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 3 Dec 2006 14:50:56 -0200
Subject: [PATCH 03/12] [DCCP] ccid3: Fix bug in calculation of send rate

The main object of this patch is the following bug:
 ==> In ccid3_hc_tx_packet_recv, the parameters p and X_recv were updated
     _after_ the send rate was calculated. This is clearly an error and is
     resolved by re-ordering statements.

In addition,
  * r_sample is converted from u32 to long to check whether the time difference
    was negative (it would otherwise be converted to a large u32 value)
  * protection against RTT=0 (this is possible) is provided in a further patch
  * t_elapsed is also converted to long, to match the type of r_sample
  * adds a a more debugging information regarding current send rates
  * various trivial comment/documentation updates

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ccids/ccid3.c | 89 ++++++++++++++++++++++++------------------
 1 file changed, 52 insertions(+), 37 deletions(-)

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 22a07248c24..bd353044c54 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -121,12 +121,15 @@ static inline void ccid3_update_send_time(struct ccid3_hc_tx_sock *hctx)
 /*
  * Update X by
  *    If (p > 0)
- *       x_calc = calcX(s, R, p);
+ *       X_calc = calcX(s, R, p);
  *       X = max(min(X_calc, 2 * X_recv), s / t_mbi);
  *    Else
  *       If (now - tld >= R)
  *          X = max(min(2 * X, 2 * X_recv), s / R);
  *          tld = now;
+ *
+ * If X has changed, we also update the scheduled send time t_now,
+ * the inter-packet interval t_ipi, and the delta value.
  */ 
 static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now)
 
@@ -413,10 +416,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 	struct dccp_tx_hist_entry *packet;
 	struct timeval now;
 	unsigned long t_nfb;
-	u32 t_elapsed;
 	u32 pinv;
-	u32 x_recv;
-	u32 r_sample;
+	long r_sample, t_elapsed;
 
 	BUG_ON(hctx == NULL);
 
@@ -427,31 +428,51 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 
 	opt_recv = &hctx->ccid3hctx_options_received;
 
-	t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
-	x_recv = opt_recv->ccid3or_receive_rate;
-	pinv = opt_recv->ccid3or_loss_event_rate;
-
 	switch (hctx->ccid3hctx_state) {
 	case TFRC_SSTATE_NO_FBACK:
 	case TFRC_SSTATE_FBACK:
-		/* Calculate new round trip sample by
-		 * R_sample = (now - t_recvdata) - t_delay */
-		/* get t_recvdata from history */
+		/* get packet from history to look up t_recvdata */
 		packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist,
 						 DCCP_SKB_CB(skb)->dccpd_ack_seq);
 		if (unlikely(packet == NULL)) {
-			DCCP_WARN("%s, sk=%p, seqno %llu(%s) does't exist "
+			DCCP_WARN("%s(%p), seqno %llu(%s) doesn't exist "
 				  "in history!\n",  dccp_role(sk), sk,
 			    (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
 				  dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
 			return;
 		}
 
-		/* Update RTT */
+		/* Update receive rate */
+		hctx->ccid3hctx_x_recv = opt_recv->ccid3or_receive_rate;
+
+		/* Update loss event rate */
+		pinv = opt_recv->ccid3or_loss_event_rate;
+		if (pinv == ~0U || pinv == 0)
+			hctx->ccid3hctx_p = 0;
+		else {
+			hctx->ccid3hctx_p = 1000000 / pinv;
+
+			if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) {
+				hctx->ccid3hctx_p = TFRC_SMALLEST_P;
+				ccid3_pr_debug("%s, sk=%p, Smallest p used!\n",
+					       dccp_role(sk), sk);
+			}
+		}
+
 		dccp_timestamp(sk, &now);
-		r_sample = timeval_delta(&now, &packet->dccphtx_tstamp);
-		if (unlikely(r_sample <= t_elapsed))
-			DCCP_WARN("r_sample=%uus,t_elapsed=%uus\n",
+
+		/*
+		 * Calculate new round trip sample as per [RFC 3448, 4.3] by
+		 * 	R_sample  =  (now - t_recvdata) - t_elapsed
+		 */
+		r_sample  = timeval_delta(&now, &packet->dccphtx_tstamp);
+		t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
+
+		if (unlikely(r_sample <= 0)) {
+			DCCP_WARN("WARNING: R_sample (%ld) <= 0!\n", r_sample);
+			r_sample = 0;
+		} else if (unlikely(r_sample <= t_elapsed))
+			DCCP_WARN("WARNING: r_sample=%ldus <= t_elapsed=%ldus\n",
 				  r_sample, t_elapsed);
 		else
 			r_sample -= t_elapsed;
@@ -474,31 +495,25 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 			hctx->ccid3hctx_t_ld = now;
 
 			ccid3_update_send_time(hctx);
+
+			ccid3_pr_debug("%s(%p), s=%u, w_init=%u, "
+				       "R_sample=%ldus, X=%u\n", dccp_role(sk),
+				       sk, hctx->ccid3hctx_s, w_init, r_sample,
+				       hctx->ccid3hctx_x);
+
 			ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
 		} else {
-			hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 +
-					      r_sample / 10;
+			hctx->ccid3hctx_rtt = (9 * hctx->ccid3hctx_rtt +
+					           (u32)r_sample        ) / 10;
+
 			ccid3_hc_tx_update_x(sk, &now);
-		}
 
-		ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, "
-			       "r_sample=%us\n", dccp_role(sk), sk,
-			       hctx->ccid3hctx_rtt, r_sample);
-
-		/* Update receive rate */
-		hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */
-
-		/* Update loss event rate */
-		if (pinv == ~0 || pinv == 0)
-			hctx->ccid3hctx_p = 0;
-		else {
-			hctx->ccid3hctx_p = 1000000 / pinv;
-
-			if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) {
-				hctx->ccid3hctx_p = TFRC_SMALLEST_P;
-				ccid3_pr_debug("%s, sk=%p, Smallest p used!\n",
-					       dccp_role(sk), sk);
-			}
+			ccid3_pr_debug("%s(%p), RTT=%uus (sample=%ldus), s=%u, "
+				       "p=%u, X_calc=%u, X=%u\n", dccp_role(sk),
+				       sk, hctx->ccid3hctx_rtt, r_sample,
+				       hctx->ccid3hctx_s, hctx->ccid3hctx_p,
+				       hctx->ccid3hctx_x_calc,
+				       hctx->ccid3hctx_x);
 		}
 
 		/* unschedule no feedback timer */

From 26af3072b035daadf34a99d02510f0ff98f41f90 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 3 Dec 2006 14:51:14 -0200
Subject: [PATCH 04/12] [DCCP] ccid3: Fix warning message about illegal ACK

This avoids a (harmless) warning message being printed at the DCCP server
(the receiver of a DCCP half connection).

Incoming packets are both directed to

 * ccid_hc_rx_packet_recv() for the server half
 * ccid_hc_tx_packet_recv() for the client half

The message gets printed since on a server the client half is currently not
sending data packets.
This is resolved for the moment by checking the DCCP-role first. In future
times (bidirectional DCCP connections), this test may have to be more
sophisticated.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ccids/ccid3.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index bd353044c54..721efc7ed31 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -555,7 +555,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		hctx->ccid3hctx_idle = 1;   
 		break;
 	case TFRC_SSTATE_NO_SENT:
-		DCCP_WARN("Illegal ACK received - no packet has been sent\n");
+		if (dccp_sk(sk)->dccps_role == DCCP_ROLE_CLIENT)
+			DCCP_WARN("Illegal ACK received - no packet sent\n");
 		/* fall through */
 	case TFRC_SSTATE_TERM:		/* ignore feedback when closing */
 		break;

From 50ab46c790a3408c441ba4c2faa9555cacc20028 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 3 Dec 2006 14:51:29 -0200
Subject: [PATCH 05/12] [DCCP] tfrc: Document boundaries and limits of the TFRC
 lookup table

This adds documentation for the TCP Reno throughput equation which is at
the heart of the TFRC sending rate / loss rate calculations.

It spells out precisely how the values were determined and what they mean.
The equations were derived through reverse engineering and found to be
fully accurate (verified using test programs).

This patch does not change any code.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ccids/lib/tfrc_equation.c | 142 ++++++++++++++++++-----------
 1 file changed, 87 insertions(+), 55 deletions(-)

diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 2601012383f..57665e97930 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -18,10 +18,76 @@
 #include "tfrc.h"
 
 #define TFRC_CALC_X_ARRSIZE 500
+#define TFRC_CALC_X_SPLIT   50000	/* 0.05 * 1000000, details below */
 
-#define TFRC_CALC_X_SPLIT 50000
-/* equivalent to 0.05 */
+/*
+  TFRC TCP Reno Throughput Equation Lookup Table for f(p)
 
+  The following two-column lookup table implements a part of the TCP throughput
+  equation from [RFC 3448, sec. 3.1]:
+
+	 			     s
+  X_calc  =  --------------------------------------------------------------
+	     R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3))
+
+  Where:
+	X      is the transmit rate in bytes/second
+	s      is the packet size in bytes
+	R      is the round trip time in seconds
+	p      is the loss event rate, between 0 and 1.0, of the number of loss
+	              events as a fraction of the number of packets transmitted
+	t_RTO  is the TCP retransmission timeout value in seconds
+	b      is the number of packets acknowledged by a single TCP ACK
+
+  We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes:
+
+				     s
+  X_calc  =  -------------------------------------------------------
+	     R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3))
+
+  which we can break down into:
+
+                      s
+	X_calc  =  ---------
+	            R * f(p)
+
+  where f(p) is given for 0 < p <= 1 by:
+
+	f(p)  =  sqrt(2*p/3) + 12 * sqrt(3*p/8) *  (p + 32*p^3)
+
+  Since this is kernel code, floating-point arithmetic is avoided in favour of
+  integer arithmetic. This means that nearly all fractional parameters are
+  scaled by 1000000:
+    * the parameters p and R
+    * the return result f(p)
+  The lookup table therefore actually tabulates the following function g(q):
+
+  	g(q)  =  1000000 * f(q/1000000)
+
+  Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer
+  granularity for the practically more relevant case of small values of p (up to
+  5%), the second column is used; the first one ranges up to 100%.  This split
+  corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also
+  determines the smallest resolution.
+
+  The entire table is generated by:
+    for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
+	lookup[i][0]  =  g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE);
+	lookup[i][1]  =  g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
+    }
+
+  With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
+    lookup[0][0]  =  g(1000000/(M+1)) 		 =  1000000 * f(0.2%)
+    lookup[M][0]  =  g(1000000)			 =  1000000 * f(100%)
+    lookup[0][1]  =  g(TFRC_CALC_X_SPLIT/(M+1))	 =  1000000 * f(0.01%)
+    lookup[M][1]  =  g(TFRC_CALC_X_SPLIT)	 =  1000000 * f(5%)
+
+  In summary, the two columns represent f(p) for the following ranges:
+    * The first column is for   0.002  <= p <= 1.0
+    * The second column is for  0.0001 <= p <= 0.05
+  Where the columns overlap, the second (finer-grained) is given preference,
+  i.e. the first column is used only for p >= 0.05.
+ */
 static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
 	{     37172,   8172 },
 	{     53499,  11567 },
@@ -525,62 +591,26 @@ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
 	{ 243315981, 271305 }
 };
 
-/* Calculate the send rate as per section 3.1 of RFC3448
- 
-Returns send rate in bytes per second
-
-Integer maths and lookups are used as not allowed floating point in kernel
-
-The function for Xcalc as per section 3.1 of RFC3448 is:
-
-X =                            s
-     -------------------------------------------------------------
-     R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2)))
-
-where 
-X is the trasmit rate in bytes/second
-s is the packet size in bytes
-R is the round trip time in seconds
-p is the loss event rate, between 0 and 1.0, of the number of loss events 
-  as a fraction of the number of packets transmitted
-t_RTO is the TCP retransmission timeout value in seconds
-b is the number of packets acknowledged by a single TCP acknowledgement
-
-we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes:
-
-X =                            s
-     -----------------------------------------------------------------------
-     R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2)))
-
-
-which we can break down into:
-
-X =     s
-     --------
-     R * f(p)
-
-where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p))
-
-Function parameters:
-s - bytes
-R - RTT in usecs
-p - loss rate (decimal fraction multiplied by 1,000,000)
-
-Returns Xcalc in bytes per second
-
-DON'T alter this code unless you run test cases against it as the code
-has been manipulated to stop underflow/overlow.
-
-*/
+/**
+ * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
+ *
+ *  @s: packet size          in bytes
+ *  @R: RTT                  scaled by 1000000   (i.e., microseconds)
+ *  @p: loss ratio estimate  scaled by 1000000
+ *  Returns X_calc           in bytes per second (not scaled).
+ *
+ * Note: DO NOT alter this code unless you run test cases against it,
+ *       as the code has been optimized to stop underflow/overflow.
+ */
 u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 {
 	int index;
 	u32 f;
 	u64 tmp1, tmp2;
 
-	if (p < TFRC_CALC_X_SPLIT)
+	if (p < TFRC_CALC_X_SPLIT) 		      /* 0      <= p <  0.05  */
 		index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1;
-	else
+	else		 			      /* 0.05   <= p <= 1.00  */
 		index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1;
 
 	if (index < 0)
@@ -599,11 +629,13 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 	else
 		f = tfrc_calc_x_lookup[index][1];
 
+	/* The following computes X = s/(R*f(p)) in bytes per second. Since f(p)
+	 * and R are both scaled by 1000000, we need to multiply by 1000000^2.
+	 * ==> DO NOT alter this unless you test against overflow on 32 bit   */
 	tmp1 = ((u64)s * 100000000);
 	tmp2 = ((u64)R * (u64)f);
 	do_div(tmp2, 10000);
 	do_div(tmp1, tmp2); 
-	/* Don't alter above math unless you test due to overflow on 32 bit */
 
 	return (u32)tmp1; 
 }
@@ -611,10 +643,10 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 EXPORT_SYMBOL_GPL(tfrc_calc_x);
 
 /*
- * args: fvalue - function value to match
- * returns: p closest to that value
+ *  tfrc_calc_x_reverse_lookup  -  try to find p given f(p)
  *
- * both fvalue and p are multiplied by 1,000,000 to use ints
+ *  @fvalue: function value to match, scaled by 1000000
+ *  Returns closest match for p, also scaled by 1000000
  */
 u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
 {

From 90fb0e60dd9178dbca2e42c682c483cdb7ea9f2d Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 3 Dec 2006 14:52:01 -0200
Subject: [PATCH 06/12] [DCCP] tfrc: Fix small error in reverse lookup of p for
 given f(p)

This fixes the following small error in tfrc_calc_x_reverse_lookup.

 1) The table is generated by the following equations:
	lookup[index][0] = g((index+1) * 1000000/TFRC_CALC_X_ARRSIZE);
	lookup[index][1] = g((index+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
    where g(q) is 1E6 * f(q/1E6)

 2) The reverse lookup assigns an entry in lookup[index][small]

 3) This index needs to match the above, i.e.
    * if small=0 then

      		p  = (index + 1) * 1000000/TFRC_CALC_X_ARRSIZE

    * if small=1 then

		p = (index+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE

These are exactly the changes that the patch makes; previously the code did
not conform to the way the lookup table was generated (this difference resulted
in a mean error of about 1.12%).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ccids/lib/tfrc_equation.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 57665e97930..78bdf348916 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -667,9 +667,9 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
 		ctr++;
 
 	if (small)
-		return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE;
+		return (ctr + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE;
 	else
-		return 1000000 * ctr / TFRC_CALC_X_ARRSIZE;
+		return (ctr + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
 }
 
 EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);

From 8d0086adac0041de66b5f41b77eec0d8d239e16c Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 3 Dec 2006 14:52:26 -0200
Subject: [PATCH 07/12] [DCCP] tfrc: Add protection against invalid parameters
 to TFRC routines

 1) For the forward X_calc lookup, it
    * protects effectively against RTT=0 (this case is possible), by
      returning the maximal lookup value instead of just setting it to 1
    * reformulates the array-bounds exceeded condition: this only happens
      if p is greater than 1E6 (due to the scaling)
    * the case of negative indices can now with certainty be excluded,
      since documentation shows that the formulas are within bounds
    * additional protection against p = 0 (would give divide-by-zero)

 2) For the reverse lookup, it warns against
    * protects against exceeding array bounds
    * now returns 0 if f(p) = 0, due to function definition
    * warns about minimal resolution error and returns the smallest table
      value instead of p=0 [this would mask congestion conditions]

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ccids/lib/tfrc_equation.c | 33 +++++++++++++++++-------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 78bdf348916..ef3233d45a6 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -608,22 +608,19 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 	u32 f;
 	u64 tmp1, tmp2;
 
+	/* check against invalid parameters and divide-by-zero   */
+	BUG_ON(p >  1000000);		/* p must not exceed 100%   */
+	BUG_ON(p == 0);			/* f(0) = 0, divide by zero */
+	if (R == 0) {			/* possible  divide by zero */
+		DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc.");
+		return ~0U;
+ 	}
+
 	if (p < TFRC_CALC_X_SPLIT) 		      /* 0      <= p <  0.05  */
 		index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1;
 	else		 			      /* 0.05   <= p <= 1.00  */
 		index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1;
 
-	if (index < 0)
-		/* p should be 0 unless there is a bug in my code */
-		index = 0;
-
-	if (R == 0) {
-		DCCP_WARN("RTT==0, setting to 1\n");
-		R = 1; /* RTT can't be zero or else divide by zero */
-	}
-
-	BUG_ON(index >= TFRC_CALC_X_ARRSIZE);
-
 	if (p >= TFRC_CALC_X_SPLIT)
 		f = tfrc_calc_x_lookup[index][0];
 	else
@@ -653,13 +650,21 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
 	int ctr = 0;
 	int small;
 
-	if (fvalue < tfrc_calc_x_lookup[0][1])
+	if (fvalue == 0)	/* f(p) = 0  whenever  p = 0 */
 		return 0;
 
+	/* Error cases. */
+	if (fvalue < tfrc_calc_x_lookup[0][1]) {
+		DCCP_WARN("fvalue %d smaller than resolution\n", fvalue);
+		return tfrc_calc_x_lookup[0][1];
+	}
+	if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
+		DCCP_WARN("fvalue %d exceeds bounds!\n", fvalue);
+		return 1000000;
+	}
+
 	if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1])
 		small = 1;
-	else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0])
-		return 1000000;
 	else
 		small = 0;
 

From 006042d7e1a0aae35c9dd9eb8ec71fa379679adb Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 3 Dec 2006 14:52:41 -0200
Subject: [PATCH 08/12] [DCCP] tfrc: Identify TFRC table limits and simplify
 code

This
 * adds documentation about the lowest resolution that is possible within
   the bounds of the current lookup table
 * defines a constant TFRC_SMALLEST_P which defines this resolution
 * issues a warning if a given value of p is below resolution
 * combines two previously adjacent if-blocks of nearly identical
   structure into one

This patch does not change the algorithm as such.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ccids/lib/tfrc_equation.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index ef3233d45a6..0a4a3d2feba 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -19,6 +19,7 @@
 
 #define TFRC_CALC_X_ARRSIZE 500
 #define TFRC_CALC_X_SPLIT   50000	/* 0.05 * 1000000, details below */
+#define TFRC_SMALLEST_P	    (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE)
 
 /*
   TFRC TCP Reno Throughput Equation Lookup Table for f(p)
@@ -68,7 +69,9 @@
   granularity for the practically more relevant case of small values of p (up to
   5%), the second column is used; the first one ranges up to 100%.  This split
   corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also
-  determines the smallest resolution.
+  determines the smallest resolution possible with this lookup table:
+
+    TFRC_SMALLEST_P   =  TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE
 
   The entire table is generated by:
     for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
@@ -79,7 +82,7 @@
   With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
     lookup[0][0]  =  g(1000000/(M+1)) 		 =  1000000 * f(0.2%)
     lookup[M][0]  =  g(1000000)			 =  1000000 * f(100%)
-    lookup[0][1]  =  g(TFRC_CALC_X_SPLIT/(M+1))	 =  1000000 * f(0.01%)
+    lookup[0][1]  =  g(TFRC_SMALLEST_P)		  = 1000000 * f(0.01%)
     lookup[M][1]  =  g(TFRC_CALC_X_SPLIT)	 =  1000000 * f(5%)
 
   In summary, the two columns represent f(p) for the following ranges:
@@ -616,15 +619,21 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p)
 		return ~0U;
  	}
 
-	if (p < TFRC_CALC_X_SPLIT) 		      /* 0      <= p <  0.05  */
-		index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1;
-	else		 			      /* 0.05   <= p <= 1.00  */
-		index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1;
+	if (p <= TFRC_CALC_X_SPLIT) 		{     /* 0.0000 < p <= 0.05   */
+		if (p < TFRC_SMALLEST_P) {	      /* 0.0000 < p <  0.0001 */
+			DCCP_WARN("Value of p (%d) below resolution. "
+				  "Substituting %d\n", p, TFRC_SMALLEST_P);
+			index = 0;
+		} else 				      /* 0.0001 <= p <= 0.05  */
+			index =  p/TFRC_SMALLEST_P - 1;
+
+ 		f = tfrc_calc_x_lookup[index][1];
+
+	} else {	 			      /* 0.05   <  p <= 1.00  */
+		index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1;
 
-	if (p >= TFRC_CALC_X_SPLIT)
 		f = tfrc_calc_x_lookup[index][0];
-	else
-		f = tfrc_calc_x_lookup[index][1];
+	}
 
 	/* The following computes X = s/(R*f(p)) in bytes per second. Since f(p)
 	 * and R are both scaled by 1000000, we need to multiply by 1000000^2.

From 44158306d756c88272c8faf243ca68897498e219 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 3 Dec 2006 14:53:07 -0200
Subject: [PATCH 09/12] [DCCP] ccid3: Deprecate TFRC_SMALLEST_P

 This patch deprecates the existing use of an arbitrary value TFRC_SMALLEST_P
 for low-threshold values of p. This avoids masking low-resolution errors.
 Instead, the code now checks against real boundaries (implemented by preceding
 patch) and provides warnings whenever a real value falls below the threshold.

 If such messages are observed, it is a better solution to take this as an
 indication that the lookup table needs to be re-engineered.

Changelog:
----------
 This patch
   * makes handling all TFRC resolution errors local to the TFRC library

   * removes unnecessary test whether X_calc is 'infinity' due to p==0 -- this
     condition is already caught by tfrc_calc_x()

   * removes setting ccid3hctx_p = TFRC_SMALLEST_P in ccid3_hc_tx_packet_recv
     since this is now done by the TFRC library

   * updates BUG_ON test in ccid3_hc_tx_no_feedback_timer to take into account
     that p now is either 0 (and then X_calc is irrelevant), or it is > 0; since
     the handling of TFRC_SMALLEST_P is now taken care of in the tfrc library

Justification:
--------------
 The TFRC code uses a lookup table which has a bounded resolution.
 The lowest possible value of the loss event rate `p' which can be
 resolved is currently 0.0001.  Substituting this lower threshold for
 p when p is less than 0.0001 results in a huge, exponentially-growing
 error.  The error can be computed by the following formula:

    (f(0.0001) - f(p))/f(p) * 100      for p < 0.0001

 Currently the solution is to use an (arbitrary) value
     TFRC_SMALLEST_P  =   40 * 1E-6   =   0.00004
 and to consider all values below this value as `virtually zero'.  Due to
 the exponentially growing resolution error, this is not a good idea, since
 it hides the fact that the table can not resolve practically occurring cases.
 Already at p == TFRC_SMALLEST_P, the error is as high as 58.19%!

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ccids/ccid3.c | 22 ++++++----------------
 net/dccp/ccids/ccid3.h |  2 --
 2 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index 721efc7ed31..cf8c07b2704 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -137,8 +137,7 @@ static void ccid3_hc_tx_update_x(struct sock *sk, struct timeval *now)
 	struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
 	const __u32 old_x = hctx->ccid3hctx_x;
 
-	/* To avoid large error in calcX */
-	if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
+	if (hctx->ccid3hctx_p > 0) {
 		hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s,
 						     hctx->ccid3hctx_rtt,
 						     hctx->ccid3hctx_p);
@@ -226,16 +225,14 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
 				       ccid3_tx_state_name(hctx->ccid3hctx_state));
 			/* Halve sending rate */
 
-			/*  If (X_calc > 2 * X_recv)
+			/*  If (p == 0 || X_calc > 2 * X_recv)
 			 *    X_recv = max(X_recv / 2, s / (2 * t_mbi));
 			 *  Else
 			 *    X_recv = X_calc / 4;
 			 */
-			BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P &&
-			       hctx->ccid3hctx_x_calc == 0);
+			BUG_ON(hctx->ccid3hctx_p && !hctx->ccid3hctx_x_calc);
 
-			/* check also if p is zero -> x_calc is infinity? */
-			if (hctx->ccid3hctx_p < TFRC_SMALLEST_P ||
+			if (hctx->ccid3hctx_p  == 0 ||
 			    hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv)
 				hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2,
 								    hctx->ccid3hctx_s / (2 * TFRC_T_MBI));
@@ -449,15 +446,8 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
 		pinv = opt_recv->ccid3or_loss_event_rate;
 		if (pinv == ~0U || pinv == 0)
 			hctx->ccid3hctx_p = 0;
-		else {
-			hctx->ccid3hctx_p = 1000000 / pinv;
-
-			if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) {
-				hctx->ccid3hctx_p = TFRC_SMALLEST_P;
-				ccid3_pr_debug("%s, sk=%p, Smallest p used!\n",
-					       dccp_role(sk), sk);
-			}
-		}
+		else
+ 			hctx->ccid3hctx_p = 1000000 / pinv;
 
 		dccp_timestamp(sk, &now);
 
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 27cb20ae1da..07596d704ef 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -51,8 +51,6 @@
 /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
 #define TFRC_T_MBI		   64
 
-#define TFRC_SMALLEST_P		   40
-
 enum ccid3_options {
 	TFRC_OPT_LOSS_EVENT_RATE = 192,
 	TFRC_OPT_LOSS_INTERVALS	 = 193,

From 2bbf29acd8f7adcf161de7e5d891b4095687a59f Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 3 Dec 2006 14:53:27 -0200
Subject: [PATCH 10/12] [DCCP] tfrc: Binary search for reverse TFRC lookup

This replaces the linear search algorithm for reverse lookup with
binary search.

It has the advantage of better scalability: O(log2(N)) instead of O(N).
This means that the average number of iterations is reduced from 250
(linear search if each value appears equally likely) down to at most 9.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Acked-by: Ian McDonald <ian.mcdonald@jandi.co.nz>
Signed-off-by: Arnaldo Carvalho de Melo <acme@mandriva.com>
---
 net/dccp/ccids/lib/tfrc_equation.c | 38 +++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index 0a4a3d2feba..ddac2c511e2 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -594,6 +594,21 @@ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
 	{ 243315981, 271305 }
 };
 
+/* return largest index i such that fval <= lookup[i][small] */
+static inline u32 tfrc_binsearch(u32 fval, u8 small)
+{
+	u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1;
+
+	while (low < high) {
+		try = (low + high) / 2;
+		if (fval <= tfrc_calc_x_lookup[try][small])
+			high = try;
+		else
+			low  = try + 1;
+	}
+	return high;
+}
+
 /**
  * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
  *
@@ -656,8 +671,7 @@ EXPORT_SYMBOL_GPL(tfrc_calc_x);
  */
 u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
 {
-	int ctr = 0;
-	int small;
+	int index;
 
 	if (fvalue == 0)	/* f(p) = 0  whenever  p = 0 */
 		return 0;
@@ -672,18 +686,14 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
 		return 1000000;
 	}
 
-	if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1])
-		small = 1;
-	else
-		small = 0;
-
-	while (fvalue > tfrc_calc_x_lookup[ctr][small])
-		ctr++;
-
-	if (small)
-		return (ctr + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE;
-	else
-		return (ctr + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
+	if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) {
+		index = tfrc_binsearch(fvalue, 1);
+		return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE;
+ 	}
+ 
+	/* else ... it must be in the coarse-grained column */
+	index = tfrc_binsearch(fvalue, 0);
+	return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
 }
 
 EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);

From b4ad86bf52469b26148c677cb59d8bc81f129cc2 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Sun, 3 Dec 2006 19:19:26 -0800
Subject: [PATCH 11/12] [XFRM] xfrm_user: Better validation of user templates.

Since we never checked the ->family value of templates
before, many applications simply leave it at zero.
Detect this and fix it up to be the pol->family value.

Also, do not clobber xp->family while reading in templates,
that is not necessary.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/xfrm/xfrm_user.c | 50 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 8 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 6f97665983d..311205ffa77 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -858,7 +858,6 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
 	int i;
 
 	xp->xfrm_nr = nr;
-	xp->family = ut->family;
 	for (i = 0; i < nr; i++, ut++) {
 		struct xfrm_tmpl *t = &xp->xfrm_vec[i];
 
@@ -876,19 +875,53 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut,
 	}
 }
 
+static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family)
+{
+	int i;
+
+	if (nr > XFRM_MAX_DEPTH)
+		return -EINVAL;
+
+	for (i = 0; i < nr; i++) {
+		/* We never validated the ut->family value, so many
+		 * applications simply leave it at zero.  The check was
+		 * never made and ut->family was ignored because all
+		 * templates could be assumed to have the same family as
+		 * the policy itself.  Now that we will have ipv4-in-ipv6
+		 * and ipv6-in-ipv4 tunnels, this is no longer true.
+		 */
+		if (!ut[i].family)
+			ut[i].family = family;
+
+		switch (ut[i].family) {
+		case AF_INET:
+			break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case AF_INET6:
+			break;
+#endif
+		default:
+			return -EINVAL;
+		};
+	}
+
+	return 0;
+}
+
 static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma)
 {
 	struct rtattr *rt = xfrma[XFRMA_TMPL-1];
-	struct xfrm_user_tmpl *utmpl;
-	int nr;
 
 	if (!rt) {
 		pol->xfrm_nr = 0;
 	} else {
-		nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl);
+		struct xfrm_user_tmpl *utmpl = RTA_DATA(rt);
+		int nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl);
+		int err;
 
-		if (nr > XFRM_MAX_DEPTH)
-			return -EINVAL;
+		err = validate_tmpl(nr, utmpl, pol->family);
+		if (err)
+			return err;
 
 		copy_templates(pol, RTA_DATA(rt), nr);
 	}
@@ -1530,7 +1563,8 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, void **xf
 	}
 
 	/*   build an XP */
-	xp = xfrm_policy_construct(&ua->policy, (struct rtattr **) xfrma, &err);        if (!xp) {
+	xp = xfrm_policy_construct(&ua->policy, (struct rtattr **) xfrma, &err);
+	if (!xp) {
 		kfree(x);
 		return err;
 	}
@@ -1979,7 +2013,7 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt,
 		return NULL;
 
 	nr = ((len - sizeof(*p)) / sizeof(*ut));
-	if (nr > XFRM_MAX_DEPTH)
+	if (validate_tmpl(nr, ut, p->sel.family))
 		return NULL;
 
 	if (p->dir > XFRM_POLICY_OUT)

From ef9467f8f0803881d6b20ad6f0f770fc39bcc2c2 Mon Sep 17 00:00:00 2001
From: Jurij Smakov <jurij@wooyd.org>
Date: Sun, 3 Dec 2006 19:33:02 -0800
Subject: [PATCH 12/12] [SUNHME]: Fix for sunhme failures on x86

The following patch fixes the failure of sunhme drivers on x86 hosts
due to missing pci_enable_device() and pci_set_master() calls, lost
during code refactoring. It has been filed as bugzilla bug #7502 [0]
and Debian bug #397460 [1].

[0] http://bugzilla.kernel.org/show_bug.cgi?id=7502
[1] http://bugs.debian.org/397460

Signed-off-by: Jurij Smakov <jurij@wooyd.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/sunhme.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/sunhme.c b/drivers/net/sunhme.c
index ec432ea879f..df40e34c776 100644
--- a/drivers/net/sunhme.c
+++ b/drivers/net/sunhme.c
@@ -3012,6 +3012,11 @@ static int __devinit happy_meal_pci_probe(struct pci_dev *pdev,
 #endif
 
 	err = -ENODEV;
+
+	if (pci_enable_device(pdev))
+		goto err_out;
+	pci_set_master(pdev);
+
 	if (!strcmp(prom_name, "SUNW,qfe") || !strcmp(prom_name, "qfe")) {
 		qp = quattro_pci_find(pdev);
 		if (qp == NULL)